In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 re-imports edited modules before each cell runs.
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from optparse import OptionParser
import sys
from time import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer

# Directory that holds the train/test CSV files (trailing slash required: paths are built by concatenation).
PATH = 'data/zebestof/'

## Demographic prediction

When a user visits our websites, we collect information about keywords extracted from the website's url. For each user, the frequency of visits per word per day is also stored. 
For example, suppose that a given user has visited the two following sites today:
- html://figaro/abc-news/aaa-bbb.html html://figaro/news/aaa.html

The keywords “seen” by this user will then be stored as follows (a semicolon is used to separate words):
abc:1; news:2; aaa:2; bbb:1

We have demographics information (like age, sex) on about 10% of our visitors, thanks to external data (or other sources of information).

The Head of Product wanted to predict demographics (age, sex) for the rest of our visitors from the keywords collected. He then spoke to Mr. Google, who advised him to hire a talented data scientist, in order to transform his idea into reality. And now, you understand why you are here today!

So, he has asked you to build a machine learning model that predicts the age and sex for each line in our dataset. The dataset was partially extracted from one month's data (a portion of each day's data was concatenated) and contains two files: train.csv (to help you train your model) and test.csv. The format is: userID, keywords, age, sex (a comma is used as the delimiter). Note that some data is missing from our dataset, and that we removed the ID and the labels (age, sex) from the test file.

Once your model is trained, you have to use the test.csv file to test your model, and send us the results as a csv file containing only three columns: ID, age_pred, sex_pred. For example, your submission file should look like:

ID, age_pred, sex_pred 

1234,35,F

3456,45,M
...

## Observations

- 

## TO DO


- TF IDF
- 10-fold cross-validation
- find out why the keywords can't be converted to a dict


In [3]:
# List the files available in the data directory.
!ls {PATH}

test1.csv  train1.csv


In [4]:
# Load the train and test CSV files from the data directory.
train_df = pd.read_csv(f'{PATH}train1.csv')
test_df = pd.read_csv(f'{PATH}test1.csv')

### Exploration and basic statistics

In [5]:
# Show the first rows of both frames.
display(train_df.head(), test_df.head())

Unnamed: 0,ID,keywords,age,sex
0,1,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62,F
1,2,restaurant:1;marrakech.shtml:1,35,M
2,3,payer:1;faq:1;taxe:1;habitation:1;macron:1;qui...,45,F
3,4,rigaud:3;laurent:3;photo:11;profile:8;photopro...,46,F
4,5,societe:1;disparition:1;proche:1;m%c3%a9lanie....,42,F


Unnamed: 0,ID,keywords,age,sex
0,1,cecilia.gosselin:1;flash:1;ville:1;obseques:1;...,44,M
1,2,p1_1697235:1;peut:1;jcms:1;les:1;acceptees:1;p...,71,M
2,3,002lundu83vnndv:1,42,M
3,4,high:3;patisserie:1;apple:3;tech:3;obseques:1;...,44,M
4,5,disparition:1;vue:1;maelys:1;deuxieme:1;place:...,48,F


In [6]:
# Remove incomplete rows: a training row is dropped if any column is missing,
# a test row only if its keywords are missing.
train_df = train_df.dropna(how='any')
test_df = test_df.dropna(subset=['keywords'], how='any')

In [7]:
# Rows left after dropping NaN (roughly 800k fewer in training and 350k fewer in test).
len(train_df), len(test_df)

(6418659, 2748743)

### TF - IDF
### BoW

In [8]:
# Build one target label per row by concatenating age and sex (e.g. "62F"),
# then keep only the model input (keywords) and that label.
train_df['class'] = train_df["age"].map(str) + train_df["sex"]
# Drop by name rather than by position so that a change in the CSV column
# order cannot silently remove the wrong columns.
train_df = train_df.drop(['ID', 'age', 'sex'], axis=1)

In [9]:
# Check the frame after the column drop.
train_df.head()

Unnamed: 0,keywords,class
0,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62F
1,restaurant:1;marrakech.shtml:1,35M
2,payer:1;faq:1;taxe:1;habitation:1;macron:1;qui...,45F
3,rigaud:3;laurent:3;photo:11;profile:8;photopro...,46F
4,societe:1;disparition:1;proche:1;m%c3%a9lanie....,42F


In [10]:
# Inspect one raw "word:count;word:count;..." string (row label 4).
train_df.loc[4, 'keywords']

'societe:1;disparition:1;proche:1;m%c3%a9lanie.gonidec:1;maelys:1;actualite:1;affich:1;repondre:1;douleurs:1;hypothyroidie:1;forum:1;profile:1;les:1;suspectent:1;articulaires:1;gendarmes:1;questions:1;marie:1;muscu:1'

In [11]:
# Strip the counts from a list of "word:count" tokens
def convert_to_words(keywords):
    """Return the word part of every token in ``keywords``.

    Each token is cut at its first ':'; a token without ':' is kept whole.
    The order of the input is preserved.
    """
    return [token.split(":", 1)[0] for token in keywords]


# Expand a {word: count} mapping into one long space-separated string
def convert_to_text(keywords):
    """Repeat every key of ``keywords`` as many times as its count says.

    Counts are passed through ``int()``, so numeric strings are accepted.
    The repeated words are joined with single spaces in iteration order.
    """
    return ' '.join(
        word
        for word in keywords
        for _ in range(int(keywords[word]))
    )


def create_dict(keywords):
    """Build a ``{word: count}`` dict from a list of "word:count" tokens.

    The count is taken to be whatever follows the LAST ':' of a token, so a
    word that itself contains ':' no longer makes the conversion fail.
    Tokens without any ':' (including empty strings, e.g. from a trailing
    ';') are skipped. Both cases made the previous ``dict(x.split(':') ...)``
    raise ValueError -- presumably the cause of the "can't convert to dict"
    TODO; confirm against the data.

    Counts are returned as strings, exactly as they appear in the token.
    When a word occurs more than once, the last occurrence wins.
    """
    pairs = {}
    for token in keywords:
        word, separator, count = token.rpartition(':')
        if not separator:
            # No "word:count" structure at all; nothing sensible to store.
            continue
        pairs[word] = count
    return pairs

In [12]:
# Work on a copy so that train_df keeps the original "word:count" strings.
x_df = train_df.copy()

In [13]:
# Turn every "word:count;word:count;..." string into a space-separated string
# of the words only (the counts are discarded), ready for a text vectorizer.
x_df['keywords'] = x_df['keywords'].apply(
    lambda row: ' '.join(convert_to_words(row.split(";")))
)

In [14]:
# Inspect one cleaned keyword string (row label 300).
x_df.loc[300, 'keywords']

'football actualites les pays france equipe direct demonstration video buts bas francaise'

In [15]:
X = x_df["keywords"]
y = x_df["class"]

# Hold out 20% of the labelled rows for evaluation (fixed seed for reproducibility).
X_train_df, X_test_df, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF with sublinear term frequency; max_df drops terms present in more than
# half of the documents and min_df drops terms present in fewer than 30 of them.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=30)

# Fit on the training split only, then reuse the fitted vectorizer for the held-out split.
X_train = vectorizer.fit_transform(X_train_df)
X_test = vectorizer.transform(X_test_df)

# mapping from integer feature index to original token string
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

In [16]:
# Number of distinct terms in the fitted vectorizer's vocabulary.
len(vectorizer.vocabulary_)

51395

In [17]:
# Baseline: multinomial Naive Bayes on the TF-IDF features, scored on the held-out split.
clf = MultinomialNB(alpha=.01).fit(X_train, y_train)
pred = clf.predict(X_test)
score = metrics.accuracy_score(y_test, pred)

print("accuracy:   %0.3f" % score)

accuracy:   0.032


In [None]:
# Cross validate using k-fold
# A separate estimator name is used on purpose: cross_val_predict fits clones
# of the estimator it is given, so rebinding `clf` here would replace the
# fitted model from the previous cell with an unfitted one and break the
# later clf.predict(...) call on the test keywords.
cv_clf = MultinomialNB(alpha=.01)
y_pred = cross_val_predict(
    cv_clf, X_train, y_train, cv=10, n_jobs=-1, verbose=20
)
# NOTE(review): the recorded run of this cell died while spawning worker
# processes (traceback truncated) -- consider a smaller n_jobs if it recurs.

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/pool.py", line 405, in _handle_workers
    pool._maintain_pool()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/pool.py", line 246, in _maintain_pool
    self._repopulate_pool()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/pool.py", line 239, in _repopulate_pool
    w.start()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 105, in start
    self._popen = self._Popen(self)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/context.py", line 277, in _Popen
    

In [18]:
# Accuracy of the held-out predictions, unrounded.
metrics.accuracy_score(y_test, pred)

0.032336967529048119

In [None]:
# Work on a copy so that test_df keeps the original "word:count" strings.
test_df2 = test_df.copy()

In [None]:
# Same cleanup as for the training keywords: keep only the words of every
# "word:count;..." string and join them with spaces.
test_df2['keywords'] = test_df2['keywords'].apply(
    lambda row: ' '.join(convert_to_words(row.split(";")))
)

In [None]:
# Inspect one cleaned test keyword string (row label 2).
test_df2.loc[2, 'keywords']

In [None]:
# Vectorize the test keywords with the already-fitted vectorizer (transform only, no refit).
test = vectorizer.transform(test_df2['keywords'])

In [None]:
# Predict the combined age+sex class for every test row.
# NOTE(review): `clf` must be a fitted estimator at this point -- confirm no earlier cell rebinds it.
pred2 = clf.predict(test)

In [None]:
# Look at the first predicted label.
pred2[0]

In [None]:
# Attach the predictions to test_df; test_df2 was copied from it, so the rows line up.
test_df["prediction"] = pred2

In [None]:
# Start the sex column from the combined prediction string.
test_df["sex"] = test_df["prediction"]

In [None]:
# Check the frame with the prediction columns added.
test_df.head()

In [None]:
# Split the combined prediction (age digits followed by the sex letter) back
# into its two parts. Raw strings avoid invalid-escape warnings for \d and \D.
test_df['sex'] = test_df['prediction'].apply(lambda label: re.sub(r"\d+", "", label))
# The age has to come from the prediction as well: the previous code ran
# re.sub over the existing 'age' column, i.e. the values loaded from the
# file rather than the model output.
test_df['age'] = test_df['prediction'].apply(lambda label: re.sub(r"\D+", "", label))

test_df = test_df.drop(['prediction'], axis=1)
test_df = test_df.drop(['keywords'], axis=1)
# NOTE(review): this replaces any existing ID values with the DataFrame
# index -- confirm that is intended.
test_df["ID"] = test_df.index

In [None]:
# Write the predictions to their own file. Writing to test1.csv would overwrite
# the input data that the notebook loads at the top, so a re-run would no
# longer see the original test set.
test_df.to_csv(f'{PATH}submission.csv', index=False)