In [4]:
import numpy as np 
import pandas as pd

from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving kiva_loans.csv to kiva_loans.csv
User uploaded file "kiva_loans.csv" with length 195852823 bytes


1. Data cleaning / processing / language parsing
1. Create features using two different NLP methods: For example, BoW vs tf-idf.
1. Use the features to fit supervised learning models for each feature set to predict the category outcomes.
1. Assess your models using cross-validation and determine whether one model performed better.
1. Pick one of the models and try to increase accuracy by at least 5 percentage points.

In [5]:
raw = pd.read_csv('kiva_loans.csv')
raw.head(5)

Unnamed: 0,id,funded_amount,loan_amount,activity,sector,use,country_code,country,region,currency,partner_id,posted_time,disbursed_time,funded_time,term_in_months,lender_count,tags,borrower_genders,repayment_interval,date
0,653051,300.0,300.0,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",PK,Pakistan,Lahore,PKR,247.0,2014-01-01 06:12:39+00:00,2013-12-17 08:00:00+00:00,2014-01-02 10:06:32+00:00,12.0,12,,female,irregular,2014-01-01
1,653053,575.0,575.0,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,PK,Pakistan,Lahore,PKR,247.0,2014-01-01 06:51:08+00:00,2013-12-17 08:00:00+00:00,2014-01-02 09:17:23+00:00,11.0,14,,"female, female",irregular,2014-01-01
2,653068,150.0,150.0,Transportation,Transportation,To repair their old cycle-van and buy another ...,IN,India,Maynaguri,INR,334.0,2014-01-01 09:58:07+00:00,2013-12-17 08:00:00+00:00,2014-01-01 16:01:36+00:00,43.0,6,"user_favorite, user_favorite",female,bullet,2014-01-01
3,653063,200.0,200.0,Embroidery,Arts,to purchase an embroidery machine and a variet...,PK,Pakistan,Lahore,PKR,247.0,2014-01-01 08:03:11+00:00,2013-12-24 08:00:00+00:00,2014-01-01 13:00:00+00:00,11.0,8,,female,irregular,2014-01-01
4,653084,400.0,400.0,Milk Sales,Food,to purchase one buffalo.,PK,Pakistan,Abdul Hakeem,PKR,245.0,2014-01-01 11:53:19+00:00,2013-12-17 08:00:00+00:00,2014-01-01 19:18:51+00:00,14.0,16,,female,monthly,2014-01-01


In [0]:
#Goal: examine language differences between genders

In [7]:
#Let's examine the possibilities
raw.groupby('borrower_genders')['use'].agg('count')

borrower_genders
female                                                                                                                                                                                                                                            426497
female, female                                                                                                                                                                                                                                     12163
female, female, female                                                                                                                                                                                                                             11676
female, female, female, female                                                                                                                                                                                                              

In [8]:
#What a mess. Let's consolidate:
raw['borrower_genders'] = raw['borrower_genders'].astype('str') 
feat = []
for i in raw.iterrows():
    val = i[1][17]
    if ('female' in val) & (', male' in val):
        feat.append('both')
    elif 'female' in val:
        feat.append('female')
    elif ('female' not in val) & ('male' in val):
        feat.append('male')
    else:
        feat.append(val)
raw['borr_gender'] = feat
raw.groupby('borr_gender')['use'].agg('count')

borr_gender
both       36241
female    492212
male      138520
nan            0
Name: use, dtype: int64

In [9]:
df = raw[['id', 'use', 'borr_gender']]
del raw
df = df[(df.borr_gender == 'female') | (df.borr_gender == 'male')]
#Reduce size
df = df.sample(20000)
print(df.shape)
df['borr_gender'].unique()

(20000, 3)


array(['male', 'female'], dtype=object)

In [10]:
df.head()

Unnamed: 0,id,use,borr_gender
462253,1122142,to pay for his upcoming semester fees,male
547042,1208105,to pay for children's school fees,male
474336,1134293,"to buy materials (iron, sheets of plywood).",male
301171,948414,"to buy bricks, iron rods, cement, sand and gra...",female
175907,1086712,purchase a bag for keeping my stock,female


# BOW Feature Generation

In [0]:
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter

In [0]:
nlp = spacy.load('en', parser=False, entity=False)

In [15]:
df = df.dropna(how='any')
df['tokens'] = df['use'].apply(lambda y: nlp(y))
df = df.reset_index()
df.head()

Unnamed: 0,index,id,use,borr_gender,tokens
0,462253,1122142,to pay for his upcoming semester fees,male,"(to, pay, for, his, upcoming, semester, fees)"
1,547042,1208105,to pay for children's school fees,male,"(to, pay, for, children, 's, school, fees)"
2,474336,1134293,"to buy materials (iron, sheets of plywood).",male,"(to, buy, materials, (, iron, ,, sheets, of, p..."
3,301171,948414,"to buy bricks, iron rods, cement, sand and gra...",female,"(to, buy, bricks, ,, iron, rods, ,, cement, ,,..."
4,175907,1086712,purchase a bag for keeping my stock,female,"(purchase, a, bag, for, keeping, my, stock)"


In [0]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text, n):
    allwords=[]
    for _ in text:    
        # Filter out punctuation and stop words.
        allwords.append([token.lemma_
                    for token in _
                    if not token.is_punct
                    and not token.is_stop])
    allwords = [item for sublist in allwords for item in sublist]    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(n)]
    

In [0]:
twoKwords = bag_of_words(df.tokens, 2000)

In [18]:
list(twoKwords)[:10]

['buy',
 'purchase',
 'sell',
 'business',
 'pay',
 'fertilizer',
 'supply',
 'to',
 'stock',
 'water']

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
vectorizer = CountVectorizer(vocabulary=twoKwords)

In [21]:
X = vectorizer.fit_transform(df['use'])
df2 =  pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
word_counts = pd.concat([df,df2], axis=1)
print(word_counts.shape)
del df2
word_counts.head()

(19998, 2005)


Unnamed: 0,index,id,use,borr_gender,tokens,buy,purchase,sell,business,pay,...,rag,remainder,fair,maximize,success,awning,catalogue,barbed,grout,pour
0,462253,1122142,to pay for his upcoming semester fees,male,"(to, pay, for, his, upcoming, semester, fees)",0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,547042,1208105,to pay for children's school fees,male,"(to, pay, for, children, 's, school, fees)",0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,474336,1134293,"to buy materials (iron, sheets of plywood).",male,"(to, buy, materials, (, iron, ,, sheets, of, p...",1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,301171,948414,"to buy bricks, iron rods, cement, sand and gra...",female,"(to, buy, bricks, ,, iron, rods, ,, cement, ,,...",1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,175907,1086712,purchase a bag for keeping my stock,female,"(purchase, a, bag, for, keeping, my, stock)",0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Try Random Forest

In [0]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split, cross_val_score

In [23]:
rfc = ensemble.RandomForestClassifier(n_estimators=25)
Y = word_counts['borr_gender']
X = np.array(word_counts.drop(['index','id','use','borr_gender','tokens'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', cross_val_score(rfc,X_test, y_test, cv=3))

Training set score: 0.9555759293215536

Test set score: [0.78410795 0.7711928  0.78432108]


# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='liblinear')
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', cross_val_score(lr,X_test, y_test, cv=3))

(11998, 1999) (11998,)
Training set score: 0.8378896482747125

Test set score: [0.77848576 0.78244561 0.78844711]


Logistic Regression appears to be doing better. 
I will not try to improve rfc performance by 5%

In [25]:
rfc = ensemble.RandomForestClassifier(n_estimators=25,
                                     max_depth=100)
Y = word_counts['borr_gender']
X = np.array(word_counts.drop(['index','id','use','borr_gender','tokens'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', cross_val_score(rfc,X_test, y_test, cv=3))

Training set score: 0.9084014002333722

Test set score: [0.79347826 0.78657164 0.78807202]


In [26]:
rfc = ensemble.RandomForestClassifier(n_estimators=25,
                                     max_depth=20)
Y = word_counts['borr_gender']
X = np.array(word_counts.drop(['index','id','use','borr_gender','tokens'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', cross_val_score(rfc,X_test, y_test, cv=3))

Training set score: 0.8078013002167028

Test set score: [0.78148426 0.77831958 0.78544636]


In [0]:
#That's not helping, let's try including more features...

In [28]:
threeKwords = bag_of_words(df.tokens, 3000)
vectorizer = CountVectorizer(vocabulary=threeKwords)

X = vectorizer.fit_transform(df['use'])
df2 =  pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
word_counts = pd.concat([df,df2], axis=1)
print(word_counts.shape)
del df2
word_counts.head()

(19998, 3005)


Unnamed: 0,index,id,use,borr_gender,tokens,buy,purchase,sell,business,pay,...,recovery,woodcutt,ma,suman,5.22,blood,pain,judite,oman,100-kilo
0,462253,1122142,to pay for his upcoming semester fees,male,"(to, pay, for, his, upcoming, semester, fees)",0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,547042,1208105,to pay for children's school fees,male,"(to, pay, for, children, 's, school, fees)",0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,474336,1134293,"to buy materials (iron, sheets of plywood).",male,"(to, buy, materials, (, iron, ,, sheets, of, p...",1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,301171,948414,"to buy bricks, iron rods, cement, sand and gra...",female,"(to, buy, bricks, ,, iron, rods, ,, cement, ,,...",1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,175907,1086712,purchase a bag for keeping my stock,female,"(purchase, a, bag, for, keeping, my, stock)",0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
rfc = ensemble.RandomForestClassifier(n_estimators=25,
                                     max_depth=100)
Y = word_counts['borr_gender']
X = np.array(word_counts.drop(['index','id','use','borr_gender','tokens'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', cross_val_score(rfc,X_test, y_test, cv=3))

Training set score: 0.9046507751291882

Test set score: [0.78710645 0.78094524 0.79407352]


In [30]:
rfc = ensemble.RandomForestClassifier(n_estimators=200,
                                     max_depth=100)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', cross_val_score(rfc,X_test, y_test, cv=3))

Training set score: 0.9101516919486581

Test set score: [0.7946027  0.78657164 0.79219805]


In [31]:
#One more performance intensive try...
rfc = ensemble.RandomForestClassifier(n_estimators=250,
                                     max_depth=None)
train = rfc.fit(X_train, y_train)
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', cross_val_score(rfc,X_test, y_test, cv=3))

Training set score: 0.9606601100183364

Test set score: [0.78523238 0.7768192  0.78769692]


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than .X of the paragraphs
                             min_df=4, # only use words that appear at least n times
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

In [34]:
use_tfidf=vectorizer.fit_transform(df['use'])
print("Number of features: %d" % use_tfidf.get_shape()[1])


Number of features: 2024


In [0]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [36]:
#Our SVD data reducer.  We are going to reduce the feature space
svd= TruncatedSVD(200)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_lsa = lsa.fit_transform(use_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_lsa,index=df.use)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 61.55840481159919
Component 0:
use
to purchase additional groceries to sell.    0.680566
to purchase additional groceries to sell.    0.680566
to purchase additional groceries to sell.    0.680566
to purchase additional groceries to sell.    0.680566
to purchase additional groceries to sell.    0.680566
to purchase additional groceries to sell.    0.680566
to purchase shawls to sell.                  0.671406
to purchase yellow eels to sell.             0.669429
to purchase offal and grease to sell.        0.669152
to purchase more crabs to sell.              0.669132
Name: 0, dtype: float64
Component 1:
use
 To buy a water filter to provide safe drinking water for their family.\t     0.983172
To buy a water filter to provide safe drinking water for their family.\t\t    0.983172
to buy a water filter to provide safe drinking water for their family.        0.983172
To buy a water filter to provide safe drinking water for their family.        

In [37]:
# Random Forest attempt
rfc = ensemble.RandomForestClassifier(n_estimators=25,
                                     max_depth=None)
Y = df['borr_gender']
X = X_lsa

train = rfc.fit(X, Y)
print('\nTest set score:', cross_val_score(rfc,X, Y, cv=3))


Test set score: [0.7940603  0.79417942 0.79939985]


In [38]:
# Random Forest attempt
rfc = ensemble.RandomForestClassifier(n_estimators=250,
                                     max_depth=100)
Y = df['borr_gender']
X = X_lsa

train = rfc.fit(X, Y)
print('\nTest set score:', cross_val_score(rfc,X, Y, cv=3))


Test set score: [0.79781011 0.80078008 0.79579895]


In [39]:
lr = LogisticRegression(solver='liblinear')
train = lr.fit(X, Y)
print('\nCV score:', cross_val_score(lr,X, Y, cv=3))


CV score: [0.79271036 0.79807981 0.80435109]


Both of these methods seem to be convergin on .8 in cross validation. 