# Understanding the Problem and Data Processing

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [78]:
#We first import the dataset
df_train=pd.read_csv("/Users/yfjac/OneDrive/Escritorio/Machine Learning - python/final project/train.csv")

In [79]:
#We drop empty rows
df_train.dropna(subset=['keywords'], inplace=True)
df_train.shape

(6418659, 4)

In [80]:
#initially we created a subset of 10,000 rows in order to test the code with: prueba=df_train[0:10000].copy()
#in order to not re-write the whole code, we just replaced the subset with a copy of the whole database.
prueba=df_train.copy()

In [81]:
prueba.shape

(6418659, 4)

In [94]:
import string
nopunc = [char for char in prueba if char not in string.punctuation]

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

We created a function that would split words and their frequencies, and return the words multiplied by their frequencies. We also eliminated stopwords for french language

In [4]:
def split_item(row):
    lista=[]
    for item in row:
        item = row.split(';') 
#    return item
    for i in range (0,len(item)):
        lista=lista+item[i].split(':')
    for i in range (1,len(lista)): #here I change the weight from string to int
        if i%2 !=0:
            lista[i]=int(lista[i])
    integers = [x for x in lista if isinstance(x, int)]
    myStrList = [x for x in lista if isinstance(x, str)]
    f_list = sum([[s] * n for s, n in zip(myStrList, integers)], [])
    no_number = [x for x in f_list if not any(c.isdigit() for c in x)]
    list_clean = [word for word in no_number if word.lower() not in stopwords.words('french')]
    final_text=""
    for x in list_clean:
        final_text=final_text+x.lower()+" "
    return final_text

In [85]:
#we run the function on the whole dataset
prueba["keywords"]=prueba["keywords"].map(lambda x: split_item(x))

In [100]:
#we test that the function works (eg. showing the word fiber x times on the first row)
prueba.head(5)

Given that running the function on the whole dataset required at least 12 hours to run, we decided to export the output into a csv file in order to not run the whole function every time that we needed to run the code.

From this point, we worked on this dataset to be more efficient.

In [87]:
prueba.to_csv('train_function.csv')

In [7]:
prueba=pd.read_csv("/Users/yfjac/OneDrive/Escritorio/train_function.csv")

In [9]:
prueba["sex"]=prueba["sex"].replace("M",1)
prueba["sex"]=prueba["sex"].replace("F",0)

In [10]:
prueba.head(4)

Unnamed: 0.1,Unnamed: 0,ID,keywords,age,sex
0,0,1,fibre fibre fibre fibre fibre fibre fibre fibr...,62,0
1,1,2,restaurant marrakech.shtml,35,1
2,2,3,payer faq taxe habitation macron detail progra...,45,0
3,3,4,rigaud rigaud rigaud laurent laurent laurent p...,46,0


In [16]:
#We discovered that it was necessary to drop null values from this new dataset
#The function gave back naan
prueba.dropna(subset=['keywords'], inplace=True)

In [17]:
#We kept 99.5% of the data
prueba.count()

Unnamed: 0    6390771
ID            6390771
keywords      6390771
age           6390771
sex           6390771
dtype: int64

In [18]:
#Divide into train and test subsets
from sklearn.model_selection import train_test_split
model=prueba[{"keywords","sex"}]
X_train,X_test,y_train,y_test = train_test_split(prueba["keywords"],prueba["sex"], test_size = 0.2, random_state = 42)

# Multinomial NB for gender prediction

We tested multinomial NB in order to predict the gender with 61% accuracy. Slightly better than the Random Forest Classifier.
We decided to run this model on the complete dataset due to its efficiency and good results.

In [20]:
vc_tf_idf = TfidfVectorizer()

In [21]:
#We fit our train data
vc_tf_idf.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [23]:
#We transform our train data
X_train_tf = vc_tf_idf.transform(X_train)
#X_train_tf[:3].nonzero()

In [24]:
#We import and train the model
prediction = dict()
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_tf,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
#We transform test data and predict the outcome with the trained model
X_test_tf = vc_tf_idf.transform(X_test)
prediction["Multinomial"] = model.predict(X_test_tf)

In [26]:
#We test accuracy
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy_score(y_test,prediction["Multinomial"])

0.6174564117810438

In [27]:
print("train score:", model.score(X_train_tf, y_train))
print("test score:", model.score(X_test_tf, y_test))

train score: 0.6364074673317925
test score: 0.6174564117810438


# Random Forest Classifier

We trained a Random Forest Classification model in order to predict the gender on the 10,000 row sample. The results were pretty similar to the Multinomial NB, however it required much more processing power (involving more than 6 hours to run on the whole dataset).

Below you can see the code and the results, however it was not run on the whole dataset.

In [74]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [75]:
cv2 = TfidfVectorizer()

In [108]:
#We fit/transform the training dataset
X_traincv2 = cv2.fit_transform(X_train)

In [109]:
#We define the model and the number of estimators
rfc = RandomForestClassifier(n_estimators = 50, random_state = 32)

In [None]:
#We train the model on the training dataset
rfc.fit(X_traincv2,y_train)

In [33]:
#We run the model on the testing dataset
X_testcv2 = cv2.transform(X_test)
prediction["RFC"] = rfc.predict(X_testcv2)

In [34]:
#We measure the accuracy
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy_score(y_test,prediction["RFC"])

0.602

In [35]:
print("rfc train score:", rfc.score(X_traincv2, y_train))
print("rfc test score:", rfc.score(X_testcv2, y_test))

rfc train score: 0.912125
rfc test score: 0.602


# Linear Regression 

We thought of a linear regression to predict age. Althought the mean squared error is not too bad, it's a little high if we consider we are estimating age, we decided to run this model on the whole dataset for it being more efficient.

We also ran a Random Forest Regressor, but were unable to run it on the whole dataset due to the processing time for the model to learn.

In [28]:
from sklearn.model_selection import train_test_split
model_age=prueba[{"keywords","age"}]
X_train_a,X_test_a,y_train_a,y_test_a = train_test_split(prueba["keywords"],prueba["age"], test_size = 0.2, random_state = 42)

In [29]:
#from sklearn.feature_extraction.text import TfidfVectorizer
vc_tf_idf_a = TfidfVectorizer()
vc_tf_idf_a.fit(X_train_a)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [95]:
X_train_atf = vc_tf_idf_a.transform(X_train)

In [31]:
#We train the model on the training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()  
lm_reg=regressor.fit(X_train_atf,y_train_a) #training the algorithm

In [32]:
#We test the model on the testins set
X_test_atf = vc_tf_idf_a.transform(X_test_a)
age_pred=regressor.predict(X_test_atf)

In [33]:
#To retrieve the intercept:
print(regressor.intercept_)
#For retrieving the slope:
print(regressor.coef_)

44.836512533495075
[ 25.67525844   4.88245405  -2.40836602 ...  35.02480582  43.12948344
 -18.31659161]


In [34]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_a, age_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test_a, age_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_a, age_pred)))

Mean Absolute Error: 10.541258804267839
Mean Squared Error: 319.94526803890324
Root Mean Squared Error: 17.887013949759844


# Random Forest for Age

We were able to run a Random Forest Regressor in order to predict age with an accuracy of 72%, however, we were not able to train the model on the whole dataset due to complications with the processing time (+40 hours and still no finish).

We left te code and the results with a subset of 10,000 rows of the original database.

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor

In [44]:
from sklearn.model_selection import train_test_split
model_age=prueba[{"keywords","age"}]
X_train_a,X_test_a,y_train_a,y_test_a = train_test_split(prueba["keywords"],prueba["age"], test_size = 0.2, random_state = 42)

In [45]:
cva = TfidfVectorizer()

In [46]:
#We fit transform the training set
X_traincva = cva.fit_transform(X_train_a)

In [47]:
#Define the model estimators
rfr = RandomForestRegressor(n_estimators = 50, random_state = 42)

In [49]:
#We train the model
rfr.fit(X_traincva, y_train_a)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [50]:
#We transform the testing dataset
X_testcva = cva.transform(X_test_a)

In [51]:
#We run the model on the testing dataset
prediction["RFR"] = rfr.predict(X_testcva)

In [52]:
#We obtain general performance indicators
errors = abs(prediction["RFR"] - y_test_a)
mape = 100 * (errors / y_test_a)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 72.74 %.


# Prediction on Test Set

In order to predict the results on the test set, we decided to use: Multinomial NB for gender, and Linear Regression for age.
Similar to training dataset, first we need to process the data to prepare it for the models to run.

In [96]:
#We import the "Test" dataset
df_test = pd.read_csv('/Users/yfjac/OneDrive/Escritorio/Machine Learning - python/final project/test.csv')

In [97]:
#We drop empty rows
df_test.dropna(subset=['keywords'], inplace=True)
df_train.shape

(6418659, 5)

In [98]:
#Initially we defined prueba_t as a subset of 10,000 rows in order to test the code: prueba_t=df_test[0:10000].copy()
#In order to not re-write the code, we later defined prueba_t as a copy of the original dataset
prueba_t=df_test.copy()

In [99]:
prueba_t.head(5)

Unnamed: 0,ID,keywords,age,sex
1,2,cecilia.gosselin:1;flash:1;ville:1;obseques:1;...,,
2,3,p1_1697235:1;peut:1;jcms:1;les:1;acceptees:1;p...,,
3,4,002lundu83vnndv:1,,
4,5,high:3;patisserie:1;apple:3;tech:3;obseques:1;...,,
5,6,disparition:1;vue:1;maelys:1;deuxieme:1;place:...,,


In [57]:
#We executed our function on the dataset to separate words and frequencies just like we did with the training dataset.
prueba_t["keywords"]=prueba_t["keywords"].map(lambda x: split_item(x))

In [58]:
prueba_t["keywords"].shape

(10000,)

Similar as we did before, we exported the result to a csv file in order to not run the function which requires 10+ hours to process the whole dataset. From this point, we used this dataset.

In [44]:
prueba_t.to_csv('test_function.csv')

In [76]:
prueba_t=pd.read_csv("/Users/yfjac/OneDrive/Escritorio/test_function.csv")

After running the function we can see that words appear many times depending on their frequencies (eg. ID 5, with high appearing 3 times).

However, we can also see that some columns are left with a NaN, representing that tere is not a valid word that can allow us to predict age and gender (eg. ID 4, that initially had 002lundu83vnndv as a word, and now appears as Nan)

In [77]:
prueba_t.head(5)

Unnamed: 0.1,Unnamed: 0,ID,keywords,age,sex
0,1,2,cecilia.gosselin flash ville obseques economie...,,
1,2,3,peut jcms acceptees beneficiaire assurances sa...,,
2,3,4,,,
3,4,5,high high high patisserie apple apple apple te...,,
4,5,6,disparition vue maelys deuxieme place actu fla...,,


In [78]:
prueba_t.count()

Unnamed: 0    2748743
ID            2748743
keywords      2736868
age                 0
sex                 0
dtype: int64

In [79]:
#Dropping again empty rows
prueba_t.dropna(subset=['keywords'], inplace=True)

In [80]:
#After dropping empty rows (Nan), we are left only with 2.7 million rows with valid data in order to run the prediction
prueba_t.count()

Unnamed: 0    2736868
ID            2736868
keywords      2736868
age                 0
sex                 0
dtype: int64

In [59]:
#test_sex=prueba_t['keywords'].copy()

In [86]:
#we transform the whole dataset
test_k = vc_tf_idf.transform(prueba_t["keywords"])

In [87]:
#we run the prediction on gender with the Multinomial NB we trained earlier with the whole training dataset
prediction["gender"] = pd.DataFrame(model.predict(test_k))
prediction['gender']

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
2736863,1
2736864,0
2736865,1
2736866,0


In [88]:
#We make sure we have no null values
prediction['gender'].isnull().sum()
prediction['gender'].count()

0    2736868
dtype: int64

In [89]:
#We predict age with the linear regression model we trained earlier with the whole training dataset
prediction["age"] = pd.DataFrame(regressor.predict(test_k))
prediction["age"]

Unnamed: 0,0
0,40.756184
1,57.357267
2,43.980407
3,45.806281
4,45.806281
...,...
2736863,48.466002
2736864,49.482571
2736865,41.444691
2736866,50.724114


In [90]:
#we integrate the resutls into the database.
prueba_t['sex_pred']=prediction['gender']
prueba_t['age_pred']=prediction['age']
prueba_t.head(5)

Unnamed: 0.1,Unnamed: 0,ID,keywords,age,sex,sex_pred,age_pred
0,1,2,cecilia.gosselin flash ville obseques economie...,,,1.0,40.756184
1,2,3,peut jcms acceptees beneficiaire assurances sa...,,,1.0,57.357267
3,4,5,high high high patisserie apple apple apple te...,,,1.0,45.806281
4,5,6,disparition vue maelys deuxieme place actu fla...,,,1.0,45.806281
5,6,7,disparition place vue garde maelys homme flash...,,,0.0,38.297183


# Exporting the outputs

In [91]:
#We export the results into a final dataset with the information required
Results = prueba_t[['ID','age_pred','sex_pred']].copy()
Results["sex_pred"]=Results["sex_pred"].replace(1,"M")
Results["sex_pred"]=Results["sex_pred"].replace(0,"F")
Results.head(5)

Unnamed: 0,ID,age_pred,sex_pred
0,2,40.756184,M
1,3,57.357267,M
3,5,45.806281,M
4,6,45.806281,M
5,7,38.297183,F


In [92]:
Results.isnull().sum()
Results.count()

ID          2736868
age_pred    2725076
sex_pred    2725076
dtype: int64

In [93]:
#We export the csv file
Results.to_csv('Results_test.csv')