# Import packages

In [26]:
import pandas as pd 
import numpy as np
import warnings

import re
import math
import statistics #stdev

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import xgboost as xgb

In [27]:
# Notebook-wide display settings: show every column and never truncate cell text
# when a DataFrame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Suppress all warnings emitted from this point on.
warnings.filterwarnings('ignore')

# Read files

In [11]:
# Location of the reviews CSV. Kept in one named constant so this machine-specific
# path is easy to find and change.
DATA_PATH = 'C:/achraf/Desktop/codes et commandes/git_wwsssuuup/IMDB Dataset.csv'

# Integer encoding of the sentiment strings: negative -> 1, positive -> 0.
LABEL_ENCODING = {'negative': 1, 'positive': 0}

# Read, de-duplicate and rename in one chain (no in-place mutation).
df = (
    pd.read_csv(DATA_PATH)
    .drop_duplicates()
    .rename(columns={'sentiment': 'label'})  # rename column
)

# Encode the label column only: a frame-wide replace would also rewrite any other
# cell (e.g. a review) whose whole value is exactly 'negative' or 'positive'.
df['label'] = df['label'].map(LABEL_ENCODING)

# Series.map turns values missing from the mapping into NaN; fail loudly instead of
# passing NaN labels on.
assert df['label'].notna().all(), "unexpected value in the sentiment column"

df.sample(1)

Unnamed: 0,review,label
42260,"Hargh... this film is so bad it's almost good. Trash at its best. Jesus' bro vs. pimps...come on. I'd say that you'd actually have to see this, it's so bad... my sides hurt when I laughed. I can't understand why this isn't in the worst 100.",1


In [12]:
# Take a subset for faster code tests. Seeding the sampler makes the subset (and
# everything derived from it) identical from one run to the next, and capping the
# size avoids the ValueError `sample` raises when asked for more rows than exist.
SUBSET_SIZE = 10000
SAMPLE_SEED = 42
df = df.sample(n=min(SUBSET_SIZE, len(df)), random_state=SAMPLE_SEED)

# Small scratch sample. Note that `test` is reassigned from X_test in the scoring
# cell before it is used for prediction.
test = df.sample(n=min(20, len(df)), random_state=SAMPLE_SEED)

In [13]:
# Features: a one-column DataFrame with the review text. Target: the label column.
X = df['review'].to_frame()
y = df['label']

# Hold out 33% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Define functions

In [14]:
# Tokenizer definition
def tokenizer(review):
    """Split a review into tokens.

    The text is cut on each of the characters '/', ' ', ':', '_', '.' and '-'.
    Empty strings produced by adjacent, leading or trailing separators are dropped.

    Parameters
    ----------
    review : str
        Text to tokenize.

    Returns
    -------
    list of str
        The non-empty tokens, in their original order.
    """
    pieces = re.split('[/ : _ . -]', review)
    # Keep only the non-empty pieces.
    return [piece for piece in pieces if piece != '']

In [15]:
# Word separators: optional digits followed by one or more non-word characters.
_WORD_SEPARATOR = re.compile(r"\d*\W+")
# Runs of ASCII letters, digits and whitespace, i.e. the characters that are NOT
# counted as special.
_PLAIN_CHARACTERS = re.compile(r'[A-Za-z0-9\s]+')


def _average_word_length(text):
    """Return the mean word length of ``text``, or 0.0 when it contains no word."""
    words = [word for word in _WORD_SEPARATOR.split(text) if word]
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)


def _letter_count(text):
    """Return the number of alphabetic characters in ``text``."""
    return sum(char.isalpha() for char in text)


def _special_character_count(text):
    """Return how many characters remain after removing '/', '.', ASCII letters, digits and whitespace."""
    without_slash_and_dot = text.replace('/', '').replace('.', '')
    return len(_PLAIN_CHARACTERS.sub('', without_slash_and_dot))


def _signed_entropy(text):
    """Return sum(p * log2(p)) over the character frequencies of ``text``.

    This is the negative of the Shannon entropy, so the value is <= 0. The sign is
    kept as-is so the feature values stay the same as before. Empty text gives 0.0.
    """
    if not text:
        return 0.0
    # Single pass over the text; dict insertion order follows first occurrence.
    counts = {}
    for char in text:
        counts[char] = counts.get(char, 0) + 1
    length = len(text)
    probabilities = [float(count) / length for count in counts.values()]
    return sum((p * math.log(p) / math.log(2.0)) for p in probabilities)


def feature_crafting(df):
    '''
    Add four numeric features computed from the 'review' column.

    The columns 'avg_word_length', 'letter_count', 'special_characters_number' and
    'entropy' are written into ``df`` itself (in place, as float columns) and the
    same object is returned.

    Each feature is computed for the whole column at once, so rows that share an
    index label keep their own values, and empty or word-less reviews yield 0.0
    instead of raising ZeroDivisionError.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'review' column of strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the four feature columns added or overwritten.
    '''
    reviews = df['review']

    # Calculate average word length
    df['avg_word_length'] = reviews.map(_average_word_length).astype(float)

    # Letter count
    df['letter_count'] = reviews.map(_letter_count).astype(float)

    # Number of special characters
    df['special_characters_number'] = reviews.map(_special_character_count).astype(float)

    # Compute entropy (signed, see _signed_entropy)
    df['entropy'] = reviews.map(_signed_entropy).astype(float)
    return df


def get_review(df):
    """Return the 'review' column of ``df`` (the text input of the text pipeline)."""
    review_column = df['review']
    return review_column

def get_numerical_features(df):
    """Return only the four crafted numeric feature columns of ``df``, in a fixed order."""
    numeric_columns = [
        'avg_word_length',
        'letter_count',
        'special_characters_number',
        'entropy',
    ]
    return df[numeric_columns]

## Transformers & pipelines

In [16]:
# Transformers: wrap the plain functions so they can be used as pipeline steps
craft_features = FunctionTransformer(func=feature_crafting, validate=False)
get_text_data = FunctionTransformer(func=get_review, validate=False)
get_numeric_data = FunctionTransformer(func=get_numerical_features, validate=False)

# Pipelines
# Numeric branch: only selects the crafted feature columns.
numeric_pipeline = Pipeline(steps=[('selector', get_numeric_data)])
# Text branch: selects the review text, then counts unigrams and bigrams produced
# by the custom tokenizer (case is preserved).
text_pipeline = Pipeline(steps=[
    ('selector', get_text_data),
    ('vectorizer', CountVectorizer(tokenizer=tokenizer,
                                   lowercase=False,
                                   max_df=1.0,
                                   ngram_range=(1, 2))),
])

# Combine the numeric and text branches.
# NOTE(review): this single FeatureUnion instance is placed in both the logistic
# regression and the XGBoost pipeline; presumably fitting one refits the object
# the other holds - confirm, and consider sklearn.base.clone if they must differ.
featureunionvect = FeatureUnion(transformer_list=[
    ('numeric', numeric_pipeline),
    ('text', text_pipeline),
])

## Pipeline Logistic Regression

In [17]:
# Logistic regression pipeline: craft the numeric features, build the combined
# numeric + text features, then classify.
lr_pipeline = Pipeline(steps=[
    ('numerical_features', craft_features),
    ('vect', featureunionvect),
    ('lr', LogisticRegression(solver='lbfgs', max_iter=7600, random_state=26)),
])

# Fit on the training split and keep the value returned by fit as lr_pipe.
lr_pipe = lr_pipeline.fit(X_train, y_train)

## Pipeline XGBoost

In [18]:
# XGBoost pipeline. The feature-crafting step is part of this pipeline too, so it
# creates the numeric columns that the 'vect' step selects instead of relying on
# them already being present in the frame it is given.
xgb_pipeline = Pipeline([('numerical_features', craft_features),
                         ('vect', featureunionvect),
                         ('clf', xgb.XGBClassifier(learning_rate=0.1))])

# Fit on the training split and keep the value returned by fit as xgb_pipe.
xgb_pipe = xgb_pipeline.fit(X_train, y_train)

In [23]:
# Score a copy of the held-out reviews so the prediction columns added below are
# not written into X_test itself.
test = X_test.copy()

lr_proba = lr_pipe.predict_proba(test)
xgb_proba = xgb_pipe.predict_proba(test)

# predict_proba returns one column per class, ordered as in the estimator's
# classes_; with the 0/1 encoding used here column 0 is label 0 (positive) and
# column 1 is label 1 (negative).
test['xgb_proba_positive'] = xgb_proba[:, 0]
test['xgb_proba_negative'] = xgb_proba[:, 1]

test['lr_proba_positive'] = lr_proba[:, 0]
test['lr_proba_negative'] = lr_proba[:, 1]

# Show everything except the (long) review text.
test[test.columns.difference(['review'])].head(2)

Unnamed: 0,avg_word_length,entropy,letter_count,lr_proba_negative,lr_proba_positive,special_characters_number,xgb_proba_negative,xgb_proba_positive
46452,4.179842,-4.459055,2115.0,0.99947,0.00053,85.0,0.395418,0.604582
31128,4.234818,-4.458602,1046.0,0.999788,0.000212,38.0,0.927792,0.072208


<br><br><br><br>
#### Save model

In [24]:
# Save and reload the fitted logistic regression pipeline with pickle
import pickle

# One name for the artifact so the save and load steps cannot drift apart.
MODEL_PATH = 'model_pipeline_060621.pkl'

# save
# pickle stores functions by reference, so the functions wrapped by the pipeline's
# FunctionTransformers and the tokenizer must be defined before the file is loaded
# in a new session.
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(lr_pipe, model_file)

# load
# SECURITY: pickle.load can execute arbitrary code - only load files you created.
with open(MODEL_PATH, 'rb') as model_file:
    clf_pipe = pickle.load(model_file)

#### Test prediction on the test set with the reloaded model

In [25]:
# Score the test frame again, this time with the pipeline reloaded from the pickle file.
lr_proba = clf_pipe.predict_proba(test)

# First probability column -> 'positive', second -> 'negative'.
test['lr_proba_positive'] = lr_proba[:, 0]
test['lr_proba_negative'] = lr_proba[:, 1]

# Show everything except the review text.
test.loc[:, test.columns.difference(['review'])].head(2)

Unnamed: 0,avg_word_length,entropy,letter_count,lr_proba_negative,lr_proba_positive,special_characters_number,xgb_proba_negative,xgb_proba_positive
46452,4.179842,-4.459055,2115.0,0.99947,0.00053,85.0,0.395418,0.604582
31128,4.234818,-4.458602,1046.0,0.999788,0.000212,38.0,0.927792,0.072208
