## Project: Text classification with Rakuten France Product Data
### Part 1: Produce the tfidf matrices
Re-using the code provided in lab 2 we compute the tfidf matrix for each product designation.
The data is then saved to a pickle file (both the dataframe with designations pre-processed and the tfidf matrices). <b> Skip first part if the data is already saved in pickles and run the second part directly, loading the data from pickle <b>


In [34]:
# global imports
import pandas as pd 
import matplotlib.pyplot as plt
import pickle ## so that we do not have to run thetfidf all the time

%matplotlib inline

In [26]:
# read the data in 
X_train_df = pd.read_csv('./data/X_train_update.csv')
Y_train_df = pd.read_csv('./data/Y_train.csv') # note to self -> rename the file

# rename columns to be readable
X_train_df = X_train_df.rename(columns = {'Unnamed: 0':'id'})
Y_train_df = Y_train_df.rename(columns = {'Unnamed: 0':'id'})

#disregard columns that are not needed
X_train_df = X_train_df.filter(['id', 'designation'])

In [27]:
# display data
X_train_df.head()

Unnamed: 0,id,designation
0,0,Olivia: Personalisiertes Notizbuch / 150 Seite...
1,1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
2,2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...
3,3,Peluche Donald - Europe - Disneyland 2000 (Mar...
4,4,La Guerre Des Tuques


In [28]:
#display data
Y_train_df.head()

Unnamed: 0,id,prdtypecode
0,0,10
1,1,2280
2,2,50
3,3,1280
4,4,2705


In [29]:
# preprocessing functions defined in lab 2

def normalize_accent(string):
    string = string.replace('á', 'a')
    string = string.replace('â', 'a')

    string = string.replace('é', 'e')
    string = string.replace('è', 'e')
    string = string.replace('ê', 'e')
    string = string.replace('ë', 'e')

    string = string.replace('î', 'i')
    string = string.replace('ï', 'i')

    string = string.replace('ö', 'o')
    string = string.replace('ô', 'o')
    string = string.replace('ò', 'o')
    string = string.replace('ó', 'o')

    string = string.replace('ù', 'u')
    string = string.replace('û', 'u')
    string = string.replace('ü', 'u')

    string = string.replace('ç', 'c')
    
    return string



def raw_to_tokens(raw_string, spacy_nlp):
    # Write code for lower-casing
    string = raw_string.lower()
    
    # Write code to normalize the accents
    string = normalize_accent(string)
        
    # Write code to tokenize
    spacy_tokens = spacy_nlp(string)
        
    # Write code to remove punctuation tokens and create string tokens
    string_tokens = [token.orth_ for token in spacy_tokens if not token.is_punct if not token.is_stop]
    
    # Write code to join the tokens back into a single string
    clean_string = " ".join(string_tokens)
    
    return clean_string



In [30]:
import spacy

spacy_nlp = spacy.load('fr')

# clean data
X_train_df['designation'] = X_train_df.designation.apply(lambda row: raw_to_tokens(row, spacy_nlp))

In [36]:
# compute the tfidf matrix 
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train.designation)

In [37]:
# save dataframe to pickle
X_train_df.to_pickle('X_train_df.pickle')

# save tfidf to pickle 
pickle.dump(X_tfidf, open('X_tfidf.pickle','wb'))

In [41]:
# check final data format
X_train_df.head()

Unnamed: 0,id,designation
0,0,olivia personalisiertes notizbuch 150 seiten p...
1,1,journal arts n° 133 28/09/2001 art marche salo...
2,2,grand stylet ergonomique bleu gamepad nintendo...
3,3,peluche donald europe disneyland 2000 marionne...
4,4,guerre tuques


### Part 2: Model testing and comparisons
If only starting here load data from pickle i.e. set load to true

In [42]:
# global imports
import pandas as pd 
import matplotlib.pyplot as plt
import pickle 

%matplotlib inline

load_pickle_data = False
if load_pickle_data:
    X_train_df = pd.read_pickle('X_train_df.pickle')
    X_tfidf = pickle.load(X_tfidf, open('X_tfidf.pickle','rb'))

In [43]:
## Split data to do cross validation
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X_tfidf,
                                                    Y_train_df.prdtypecode, 
                                                   test_size = 0.2, 
                                                   random_state = 42)

# check shapes
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(67932, 83370)
(67932,)
(16984, 83370)
(16984,)


In [61]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import  f1_score, confusion_matrix
    
# Grab all models
models = [RandomForestClassifier(max_depth = 75)] #, 
              #AdaBoostClassifier(random_state=42, learning_rate= 0.1), 
          #BaggingClassifier(), GradientBoostingClassifier(), DecisionTreeClassifier()] 
model_names = ['RandomForest']#, 'AdaBoost', 'Bagging', 'Gradient_boosting', 'Decision_tree'] 
f1_scores = list()

for idx, model in enumerate(models):
    print('Running model: ', model_names[idx])
    
    # fit and predict
    clf = model
    clf.fit(X_train, y_train)
    test_pred = clf.predict(X_test)
    
    f1_scores.append(f1_score(y_test, test_pred, average='weighted'))
    print(f1_scores[idx])
    print(confusion_matrix(y_test,test_pred))

d = {'Modelling Algo':model_names,'f1_score':f1_scores} 

Running model:  RandomForest
0.6388235838120959
[[ 120    1    0    0    1    1    0    2    0    6    0    0    0    2
     2    0    4    1  154   47    0    4    0  251    0   16    0]
 [  29  182   24    0    5    7    0    0    2    2    0    0    0    0
     0    0    3    0   27    6   15    5    1  205    0    6    2]
 [   2    4  182    7    7    5    0    1    0   16    0    7    1    6
     1    0    4    0    3    2   14    4    0   91    0    0    0]
 [   0    2    8  131    0    0    0    0    0    1    0    0    0    0
     0    0    0    0    0    0   10    0    0    9    0    0    0]
 [  10    8    0    0  302   15    2    7    0    6    0    1    5    2
     0    0    2    1   27   16    2    4    0  129    0    0    0]
 [   3    1    0    0    3  638    0    0    1    3    0    0    0    0
     0    0    0    0   18    4    0    1    0  113    0    1    0]
 [   9    1    0    0   11    4   35    6    2    1    1    0    0    3
     0    0    0    0    8    2    0    