In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

1. Data cleaning / processing / language parsing
1. Create features using two different NLP methods: For example, BoW vs tf-idf.
1. Use the features to fit supervised learning models for each feature set to predict the category outcomes.
1. Assess your models using cross-validation and determine whether one model performed better.
1. Pick one of the models and try to increase accuracy by at least 5 percentage points.

In [None]:
# Read the Kiva loans table from the Kaggle input directory and preview it.
raw = pd.read_csv('../input/kiva_loans.csv')
raw.head()

In [None]:
#Goal: examine language differences between genders

In [None]:
# List every distinct borrower_genders value together with the number of
# non-null `use` descriptions recorded for it.
raw.groupby('borrower_genders')['use'].count()

In [None]:
# What a mess. Let's consolidate the comma-separated gender lists into a
# single label per loan: 'female', 'male', 'both', or the original value
# when neither gender is present (e.g. the string 'nan' for missing data).
def _consolidate_gender(value):
    """Collapse a comma-separated list of borrower genders into one label.

    The list is split into individual entries rather than substring-matched,
    so a mixed group is labelled 'both' regardless of the order in which the
    genders are listed ("male, female" as well as "female, male").
    """
    genders = {part.strip() for part in value.split(',')}
    if {'female', 'male'} <= genders:
        return 'both'
    if 'female' in genders:
        return 'female'
    if 'male' in genders:
        return 'male'
    return value


# Cast to str so missing values become the literal 'nan' and can be split.
raw['borrower_genders'] = raw['borrower_genders'].astype('str')
# Look the column up by name instead of by position so the cell keeps working
# if the CSV column order changes.
raw['borr_gender'] = raw['borrower_genders'].map(_consolidate_gender)
raw.groupby('borr_gender')['use'].agg('count')

In [None]:
# Keep only the columns needed for modelling and free the full table.
df = raw[['id', 'use', 'borr_gender']]
del raw
# Restrict to single-gender loans so the target is binary.
df = df[(df.borr_gender == 'female') | (df.borr_gender == 'male')]
# Rows without a `use` description cannot be tokenised or vectorised later.
df = df.dropna(subset=['use'])
# Reduce size. The sample is seeded so a re-run reproduces the same rows, and
# capped at the number of rows available so it cannot raise on a smaller input.
df = df.sample(n=min(20000, len(df)), random_state=0)
print(df.shape)
df['borr_gender'].unique()

In [None]:
df.head()

# BOW Feature Generation

In [None]:
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter

In [None]:
# Only tokenisation, lemmas and stop-word flags are used below, so skip the
# dependency parser and the entity recogniser. spaCy 2.x expects the components
# to be listed in `disable`; the older `parser=False, entity=False` keywords are
# the v1 spelling.
# NOTE(review): the 'en' shortcut requires spaCy < 3 - confirm the kernel's version.
nlp = spacy.load('en', disable=['parser', 'ner'])

In [None]:
# Run every loan description through the spaCy pipeline and keep the parsed
# document next to its text.
df['tokens'] = df['use'].map(nlp)
# Move the sampled row labels into an 'index' column so the frame is
# numbered 0..n-1 from here on.
df = df.reset_index()
df.head()

In [None]:
# Utility function to create a list of the n most common words.
def bag_of_words(text, n):
    """Return the ``n`` most frequent lemmas across a collection of parsed docs.

    Parameters
    ----------
    text : iterable
        Parsed documents; each one is iterated to yield tokens exposing
        ``lemma_``, ``is_punct``, ``is_stop`` and ``is_space``.
    n : int or None
        Number of words to return; ``None`` returns every counted word.

    Returns
    -------
    list of str
        Lower-cased lemmas, most frequent first.
    """
    counts = Counter(
        # Lower-case so the list can be used as a vocabulary for a vectorizer
        # that lower-cases its input; case variants are counted together.
        token.lemma_.lower()
        for doc in text
        for token in doc
        # Filter out punctuation, stop words and whitespace-only tokens.
        if not (token.is_punct or token.is_stop or token.is_space)
    )
    # Return the most common words.
    return [word for word, _count in counts.most_common(n)]
    

In [None]:
twoKwords = bag_of_words(df.tokens, 2000)

In [None]:
list(twoKwords)[:10]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer(vocabulary=twoKwords)

In [None]:
# Count how often each vocabulary word occurs in every loan description.
X = vectorizer.fit_transform(df['use'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab_terms = list(vectorizer.get_feature_names_out())
else:
    vocab_terms = vectorizer.get_feature_names()
# Prefix the count columns so that a vocabulary word such as "use" or "id"
# cannot share a name with a metadata column; otherwise dropping the metadata
# columns by name later would silently discard that word's counts as well.
bow_counts = pd.DataFrame(X.toarray(),
                          columns=['w_' + term for term in vocab_terms])
# df has a fresh 0..n-1 index at this point, so the rows line up positionally.
word_counts = pd.concat([df, bow_counts], axis=1)
print(word_counts.shape)
del bow_counts
word_counts.head()

# Try Random Forest

In [None]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Baseline random forest on the bag-of-words counts (seeded for repeatable scores).
rfc = ensemble.RandomForestClassifier(n_estimators=25, random_state=0)
Y = word_counts['borr_gender']
# `axis` is passed by keyword: the positional form drop(labels, 1) is rejected
# by pandas 2.0 and later.
X = np.array(word_counts.drop(['index','id','use','borr_gender','tokens'], axis=1))

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
# Accuracy of the model fitted above on rows it has never seen.
print('\nTest set score:', rfc.score(X_test, y_test))
# cross_val_score refits fresh copies of the estimator on folds of the
# held-out split only, so it is reported separately from the test score.
print('\nCross-validation scores (held-out split):',
      cross_val_score(rfc, X_test, y_test, cv=3))

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the train/test split left in the kernel by the
# random-forest cell above.
lr = LogisticRegression(solver='liblinear')
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
# Accuracy of the model fitted above on rows it has never seen.
print('\nTest set score:', lr.score(X_test, y_test))
# cross_val_score refits fresh copies of the estimator on folds of the
# held-out split only, so it is reported separately from the test score.
print('\nCross-validation scores (held-out split):',
      cross_val_score(lr, X_test, y_test, cv=3))

Logistic Regression appears to be doing better. 
I will now try to improve RFC performance by at least 5 percentage points.

In [None]:
# Random forest with tree depth capped at 100 (seeded for repeatable scores).
rfc = ensemble.RandomForestClassifier(n_estimators=25,
                                     max_depth=100,
                                     random_state=0)
Y = word_counts['borr_gender']
# `axis` is passed by keyword: the positional form drop(labels, 1) is rejected
# by pandas 2.0 and later.
X = np.array(word_counts.drop(['index','id','use','borr_gender','tokens'], axis=1))

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
# Accuracy of the model fitted above on rows it has never seen.
print('\nTest set score:', rfc.score(X_test, y_test))
# cross_val_score refits fresh copies of the estimator on folds of the
# held-out split only, so it is reported separately from the test score.
print('\nCross-validation scores (held-out split):',
      cross_val_score(rfc, X_test, y_test, cv=3))

In [None]:
# Random forest with tree depth capped at 20 (seeded for repeatable scores).
rfc = ensemble.RandomForestClassifier(n_estimators=25,
                                     max_depth=20,
                                     random_state=0)
Y = word_counts['borr_gender']
# `axis` is passed by keyword: the positional form drop(labels, 1) is rejected
# by pandas 2.0 and later.
X = np.array(word_counts.drop(['index','id','use','borr_gender','tokens'], axis=1))

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
# Accuracy of the model fitted above on rows it has never seen.
print('\nTest set score:', rfc.score(X_test, y_test))
# cross_val_score refits fresh copies of the estimator on folds of the
# held-out split only, so it is reported separately from the test score.
print('\nCross-validation scores (held-out split):',
      cross_val_score(rfc, X_test, y_test, cv=3))

In [None]:
#That's not helping, let's try including more features...

In [None]:
# Rebuild the bag-of-words features with a larger, 3000-word vocabulary.
threeKwords = bag_of_words(df.tokens, 3000)
vectorizer = CountVectorizer(vocabulary=threeKwords)

X = vectorizer.fit_transform(df['use'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab_terms = list(vectorizer.get_feature_names_out())
else:
    vocab_terms = vectorizer.get_feature_names()
# Prefix the count columns so that a vocabulary word such as "use" or "id"
# cannot share a name with a metadata column; otherwise dropping the metadata
# columns by name later would silently discard that word's counts as well.
bow_counts = pd.DataFrame(X.toarray(),
                          columns=['w_' + term for term in vocab_terms])
# df has a fresh 0..n-1 index at this point, so the rows line up positionally.
word_counts = pd.concat([df, bow_counts], axis=1)
print(word_counts.shape)
del bow_counts
word_counts.head()

In [None]:
# Random forest (depth capped at 100) on the current word_counts frame,
# seeded for repeatable scores.
rfc = ensemble.RandomForestClassifier(n_estimators=25,
                                     max_depth=100,
                                     random_state=0)
Y = word_counts['borr_gender']
# `axis` is passed by keyword: the positional form drop(labels, 1) is rejected
# by pandas 2.0 and later.
X = np.array(word_counts.drop(['index','id','use','borr_gender','tokens'], axis=1))

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
# Accuracy of the model fitted above on rows it has never seen.
print('\nTest set score:', rfc.score(X_test, y_test))
# cross_val_score refits fresh copies of the estimator on folds of the
# held-out split only, so it is reported separately from the test score.
print('\nCross-validation scores (held-out split):',
      cross_val_score(rfc, X_test, y_test, cv=3))

In [None]:
# Larger forest (200 trees) on the train/test split left in the kernel by the
# previous cell; seeded for repeatable scores.
rfc = ensemble.RandomForestClassifier(n_estimators=200,
                                     max_depth=100,
                                     random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
# Accuracy of the model fitted above on rows it has never seen.
print('\nTest set score:', rfc.score(X_test, y_test))
# cross_val_score refits fresh copies of the estimator on folds of the
# held-out split only, so it is reported separately from the test score.
print('\nCross-validation scores (held-out split):',
      cross_val_score(rfc, X_test, y_test, cv=3))

In [None]:
# One more performance-intensive try: 250 unrestricted-depth trees on the
# train/test split left in the kernel; seeded for repeatable scores.
rfc = ensemble.RandomForestClassifier(n_estimators=250,
                                     max_depth=None,
                                     random_state=0)
train = rfc.fit(X_train, y_train)
print('Training set score:', rfc.score(X_train, y_train))
# Accuracy of the model fitted above on rows it has never seen.
print('\nTest set score:', rfc.score(X_test, y_test))
# cross_val_score refits fresh copies of the estimator on folds of the
# held-out split only, so it is reported separately from the test score.
print('\nCross-validation scores (held-out split):',
      cross_val_score(rfc, X_test, y_test, cv=3))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# tf-idf feature extractor for the loan `use` descriptions.
vectorizer = TfidfVectorizer(max_df=0.5, # ignore terms found in more than half of the documents
                             min_df=4, # ignore terms found in fewer than 4 documents
                             stop_words='english', # remove scikit-learn's built-in English stop-word list
                             lowercase=True, # convert everything to lower case before tokenising
                             use_idf=True, # weight term counts by inverse document frequency
                             norm=u'l2', # scale each document vector to unit L2 length so long and short descriptions are comparable
                             smooth_idf=True # add 1 to every document frequency, as if one extra document contained every term once; prevents divide-by-zero
                            )

In [None]:
# Learn the tf-idf vocabulary from the loan descriptions and vectorise them.
use_tfidf = vectorizer.fit_transform(df['use'])
n_features = use_tfidf.shape[1]
print("Number of features: %d" % n_features)


In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [None]:
# Our SVD data reducer: project the tf-idf matrix onto 200 components and
# re-normalise each row (latent semantic analysis). TruncatedSVD uses a
# randomised solver by default, so it is seeded to make the components and
# every downstream score repeatable.
svd = TruncatedSVD(200, random_state=0)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Fit the SVD on the full tf-idf matrix (no train/test split is made here),
# then project that same matrix.
X_lsa = lsa.fit_transform(use_tfidf)

variance_explained = svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:", total_variance*100)

# Look at which descriptions load most heavily on each of the first five
# components, to get a feel for what each component captures.
paras_by_component = pd.DataFrame(X_lsa, index=df.use)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:, i].sort_values(ascending=False).head(10))

In [None]:
# Random forest on the LSA features (seeded for repeatable scores).
rfc = ensemble.RandomForestClassifier(n_estimators=25,
                                     max_depth=None,
                                     random_state=0)
Y = df['borr_gender']
X = X_lsa

train = rfc.fit(X, Y)
# These are 3-fold cross-validation scores over all rows (there is no separate
# test split in this cell), so they are labelled as such.
print('\nCV score:', cross_val_score(rfc, X, Y, cv=3))

In [None]:
# Larger random forest (250 trees, depth capped at 100) on the LSA features,
# seeded for repeatable scores.
rfc = ensemble.RandomForestClassifier(n_estimators=250,
                                     max_depth=100,
                                     random_state=0)
Y = df['borr_gender']
X = X_lsa

train = rfc.fit(X, Y)
# These are 3-fold cross-validation scores over all rows (there is no separate
# test split in this cell), so they are labelled as such.
print('\nCV score:', cross_val_score(rfc, X, Y, cv=3))

In [None]:
# Logistic regression on the X / Y currently in the kernel, scored with
# 3-fold cross-validation.
lr = LogisticRegression(solver='liblinear')
train = lr.fit(X, Y)
cv_scores = cross_val_score(lr, X, Y, cv=3)
print('\nCV score:', cv_scores)

Both of these methods seem to be converging on about 0.8 in cross-validation.