In [1]:
import os.path
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install nltk
!pip install scikit-learn
!pip install pandas
!pip install tqdm
!pip install numpy

You should consider upgrading via the '/Users/abbaslawal/workspace/cwt_case_study/venv/bin/python -m pip install --upgrade pip' command.[0m
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     |████████████████████████████████| 12.8 MB 9.4 MB/s            
You should consider upgrading via the '/Users/abbaslawal/workspace/cwt_case_study/venv/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
You should consider upgrading via the '/Users/abbaslawal/workspace/cwt_case_study/venv/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/abbaslawal/workspace/cwt_case_study/venv/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/abbaslawal/workspace/c

In [2]:
try:
    from nltk.tokenize import TweetTokenizer
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from nltk import word_tokenize, FreqDist
except ImportError:
    nltk.download('omw-1.4')
    nltk.download('wordnet')
    nltk.download('stopwords')
    from nltk.tokenize import TweetTokenizer
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from nltk import word_tokenize, FreqDist
   
import spacy
import re
import os
import pandas as pd

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_curve, auc
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [4]:
import warnings
warnings.filterwarnings('ignore')

from cwt_case_study.pre_processor import NltkPreProcessor

# Task 1

We have a ML model that takes as input a sentence and returns a label. Our Head of Data Science, defined the methods that should be performed to pre-process the input/text/sentence and wants you the developer to publish a library that that implements the following methods.

- Remove digits
- Lowercase
- Remove puncuations
- Remove stopwords
- Get the lemma for each word


This library will be re-used by our Data Scientists. So they might want to reuse the methods from the base model or implement new ones.

<b>Your task is the following:</b>
* refactor the code bellow and create a base class and other classes that will inherit and implement the functions defined bellow. 
* Make sure your base model should enforce those methods so if any data scientist wants to reuse your code, you must make sure that they implement those functions


Please note, in the code bellow you will find 3 sections
* General Methods
* NLTK Methods
* Spacy Methods

General methods are written using plain Python and we use NLTK and Spacy methods only for removing stopwords and lemmatising the words. Make sure you reflect those libraries as well in your implementation. Our Data Scientist will have the freedom to choose a pre-processor based on the library of their choise. 
For instance, if we choose to use Spacy, the methods remove stopwords and lematise should be implemented by that library specifically.

## General Methods

In [5]:
text = "Hello, how are you today? I hope you don't find this case study too dificult!"
# Remove digits
text = text.lower()
# Lowercase
text = re.sub(r'\d+', '', text)
# Remove puncuations
text =  re.sub(r'[^\w\s]', '', text)


In [6]:
text

'hello how are you today i hope you dont find this case study too dificult'

## NLTK Methods

In [7]:
# text = "Hello, how are you today? I hope you don't find this case study too dificult!"

# Remove stopwords
text = " ".join([w for w in text.split() if w not in set(stopwords.words("english"))])
# Get the lemma for each word
text = " ".join([WordNetLemmatizer().lemmatize(w) for w in text.split()])

text

'hello today hope dont find case study dificult'

## Spacy Methods


In [8]:
## General methods 
text = "Hello, how are you today? I hope you don't find this case study too dificult!"
# Remove digits
text = text.lower()
# Lowercase
text = re.sub(r'\d+', '', text)
# Remove puncuations
text =  re.sub(r'[^\w\s]', '', text)

In [9]:
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "textcat", "ner"])

text = "Hello, how are you today? I hope you don't find this case study too dificult!"
doc = nlp(text)


In [10]:
# Remove stopwords
doc = [token for token in doc if not token.is_stop]
# Get the lemma for each word
doc = [str(token.lemma_) for token in doc ]
doc = " ".join(doc)

In [11]:
doc

'hello , today ? hope find case study dificult !'

# Training the model

Bellow you can find the code we used to train a basic model. The only thing you can change here is the pre-proseesing methods used to clean the 'Review Text' column. You can use your own PreProcessor created from Task 1. 


In [12]:
pre_processor = NltkPreProcessor()

df1 = pd.read_csv(os.path.join('data', 'Womens Clothing E-Commerce Reviews.csv' ))
df = df1[['ReviewText','Rating','ClassName','Age']]
df['ReviewText'] = df['ReviewText'].fillna('')
df['ReviewText'] = df['ReviewText'].apply(pre_processor.lowercase)
df['ReviewText'] = df['ReviewText'].apply(pre_processor.remove_digits)
df['ReviewText'] = df['ReviewText'].apply(pre_processor.remove_punctuation)
df['ReviewText'] = df['ReviewText'].apply(pre_processor.remove_stop_words)
df['ReviewText'] = df['ReviewText'].apply(pre_processor.get_lemma)


# CountVectorizer() converts a collection 
# of text documents to a matrix of token counts
vectorizer = CountVectorizer()
# assign a shorter name for the analyze
# which tokenizes the string
analyzer = vectorizer.build_analyzer()

def wordcounts(s):
    c = {}
    # tokenize the string and continue, if it is not empty
    if analyzer(s):
        d = {}
        # find counts of the vocabularies and transform to array
        w = vectorizer.fit_transform([s]).toarray()
        # vocabulary and index (index of w)
        vc = vectorizer.vocabulary_
        # items() transforms the dictionary's (word, index) tuple pairs
        for k,v in vc.items():
            d[v]=k # d -> index:word
        for index,i in enumerate(w[0]):
            c[d[index]] = i # c -> word:count
    return  c

# add new column to the dataframe
df['WordCounts'] = df['ReviewText'].apply(wordcounts)
df.head()


Unnamed: 0,ReviewText,Rating,ClassName,Age,WordCounts
0,absolutely wonderful silky sexy comfortable,4,Intimates,33,"{'absolutely': 1, 'comfortable': 1, 'sexy': 1,..."
1,love dress sooo pretty happened find store im ...,5,Dresses,34,"{'bc': 2, 'bought': 1, 'definitely': 1, 'dress..."
2,high hope dress really wanted work initially o...,3,Dresses,60,"{'bottom': 1, 'cheap': 1, 'comfortable': 1, 'c..."
3,love love love jumpsuit fun flirty fabulous ev...,5,Pants,50,"{'compliment': 1, 'every': 1, 'fabulous': 1, '..."
4,shirt flattering due adjustable front tie perf...,5,Blouses,47,"{'adjustable': 1, 'cardigan': 1, 'due': 1, 'fl..."


In [13]:
# Rating of 4 or higher -> positive, while the ones with 
# Rating of 2 or lower -> negative 
# Rating of 3 -> neutral
df = df[df['Rating'] != 3]
df['Sentiment'] = df['Rating'] >=4
df.head()

# split data
train_data,test_data = train_test_split(df,train_size=0.8,random_state=0)
print(test_data['ReviewText'])

# select the columns and 
# prepare data for the models 
X_train = vectorizer.fit_transform(train_data['ReviewText'])
y_train = train_data['Sentiment']
X_test = vectorizer.transform(test_data['ReviewText'])
y_test = test_data['Sentiment']

261      top cute got lighter color fit great go many t...
6466     love color top dark olive green elegant mostly...
9853     like color design looked super cute tried brou...
20688    love tunic detail weight fringe run big would ...
10497    ordered dress online color pretty like pastel ...
                               ...                        
17765    love love love dressits quality well fitting j...
21250    several pair ag jean always pleased get saw st...
6214     dress well made fabric thick enough winter wea...
15821                                                     
10260            comfortable fit muscley flabby arm pretty
Name: ReviewText, Length: 4123, dtype: object


In [14]:
lr = LogisticRegression()
lr.fit(X_train,y_train)

LogisticRegression()

In [15]:
pred_lr = lr.predict_proba(X_test)[:,1]
print(list(pred_lr))
fpr_lr,tpr_lr,_ = roc_curve(y_test,pred_lr)
roc_auc_lr = auc(fpr_lr,tpr_lr)
print(roc_auc_lr)

[0.991353498275863, 0.9997123792447139, 0.19981989459426158, 0.9117085031376856, 0.6657515749602467, 0.022473180437923335, 0.9936812957433047, 0.9969413029850495, 0.9888728544867184, 0.9997073934607504, 0.0013011420521687681, 0.9924774280531372, 0.9999876179543494, 0.9994370977582222, 0.9883340512725692, 0.9999770740635612, 0.9998635360091221, 0.8773136659218499, 0.9615445819356562, 0.008541202226746045, 0.0028262497651549857, 0.9985397737522616, 0.9688093359753277, 0.9995499406438306, 0.9993065504922989, 0.9995258430940605, 0.9023588455643947, 0.9905481878295908, 0.9996406532387078, 0.9956254767298196, 0.9995424960016587, 0.995064837389222, 0.9996940742757816, 0.22336990919921826, 0.0001373400022268921, 0.9886627041602204, 0.004929144452507996, 0.9959706583724496, 0.9468224473075192, 0.990228485969285, 0.1881560060722602, 0.9998723583427845, 0.9993863758192048, 0.999779012776332, 0.9997306050920235, 0.96611104858593, 0.7192365409747054, 0.8826794631320003, 0.7681142263282339, 0.977772

# Task 2

In the Gridsearch function, we have an array of different values as the hyperparameter of our model training.
We also have a function called ```train_and_evaluate``` which takes as input one of those values from the array and returns the accuracy score of the model based on that value

We know the following insights for the array:
* One of those values would yield the highest accuracy score. 
* The values in the array are sorted
* We already know that the if we use the array's values from the order, the model's accuracy will start increasing up to a certain value and then it will start decreasing again
    * You can find an example in ```cell number 18``` where we run this snippet ```print(grid_search)```
* There exists some ```index``` with ```0 < i < arr.length - 1``` such that:
    * ```arr[0] < arr[1] < ... < arr[i - 1] < arr[i]```
    * ```arr[i] > arr[i + 1] > ... > arr[arr.length - 1]```

<b>Can you think of an algorithm that will identify this value without having to run the whole array and reduce the complexity of the current algorithm?</b>

* Please implement the following function bellow ]```find_the_best_c_value_optimised```.
* The output of your function should be the same as the the results of ``` find_the_best_c_value```, but with fewer ittirations

<b>Please remember you are tasked to improve the logic of looping through the array. Do not spend time improving the speed of the Logistic Regression model</b>

## Grid Search

In [16]:
import tqdm 
import numpy as np

def grid_search(X_train,y_train,X_test,y_test )-> pd.DataFrame:

    """
    """

    grid = {}
    C = [0.001,0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 5, 10, 15, 25, 50, 100, 1000]
    
    model_results = {
                     'C': [],
                    'roc':[]
                    }

    total_combinations = len(C)
    print(f"Total Models to grid search: {total_combinations}")

    with tqdm.tqdm(total=total_combinations)as pbar:

        # iterate through number of topics
        for c in C:

            lr = LogisticRegression(C=c)
            lr.fit(X_train,y_train)

            pred_lr = lr.predict_proba(X_test)[:,1]
            fpr_lr,tpr_lr,_ = roc_curve(y_test,pred_lr)
            roc_auc_lr = auc(fpr_lr,tpr_lr)

            # Save the model results
            model_results['C'].append(c)
            model_results['roc'].append(roc_auc_lr)
            pbar.update(1)
        
        grid_search_results = pd.DataFrame(model_results)
        pbar.close()
    
        return grid_search_results

In [17]:
grid_search = grid_search(X_train,y_train,X_test,y_test)

Total Models to grid search: 15


100%|██████████| 15/15 [00:10<00:00,  1.47it/s]


In [18]:
print(grid_search)

           C       roc
0      0.001  0.880384
1      0.005  0.909997
2      0.010  0.920672
3      0.050  0.936956
4      0.100  0.940044
5      0.250  0.941380
6      0.500  0.940255
7      1.000  0.937600
8      5.000  0.924823
9     10.000  0.917832
10    15.000  0.914111
11    25.000  0.909872
12    50.000  0.904416
13   100.000  0.900607
14  1000.000  0.893108


In [19]:
def train_and_evaluate(hyperparameter_value):
    c = hyperparameter_value
    lr = LogisticRegression(C=c)
    lr.fit(X_train,y_train)

    pred_lr = lr.predict_proba(X_test)[:,1]
    fpr_lr,tpr_lr,_ = roc_curve(y_test,pred_lr)
    roc_auc_lr = auc(fpr_lr,tpr_lr)

    return roc_auc_lr

def find_the_best_c_value(c_array):
    number_of_iterations = 0
    model_score = []
    
    for c in c_array:
        number_of_iterations +=1
        score = train_and_evaluate(c)
        model_score.append({'c':c, 'score':score})
        
    
    best_c_value = max(model_score, key=lambda x:x['score'])
    print(f"Best C values is: {best_c_value['c']} with score {best_c_value['score']}")
    print(f"Number of iterations: {number_of_iterations}")
    

In [20]:
example_array_1 = [0.001,0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 5, 10, 15, 25, 50, 100, 1000]
example_array_2 = [1,2,3,4,5,6,7,8,9,10]
example_array_3 = list(np.linspace(start=0.001, stop = 0.1, num=100 ))

In [21]:
find_the_best_c_value(example_array_1)
print('-------------------------------')
find_the_best_c_value(example_array_2)
print('-------------------------------')
find_the_best_c_value(example_array_3)
print('-------------------------------')

Best C values is: 0.25 with score 0.9413798650041405
Number of iterations: 15
-------------------------------
Best C values is: 1 with score 0.9376004575323043
Number of iterations: 10
-------------------------------
Best C values is: 0.1 with score 0.9400442042929399
Number of iterations: 100
-------------------------------


In [22]:
def find_the_best_c_value_optimised(c_array):
    number_of_iterations = 0
    previous_score = 0

    for index, c_value in enumerate(c_array):
        number_of_iterations +=1
        if index == 0:
            previous_score = train_and_evaluate(c_value)
        else:
            current_score = train_and_evaluate(c_value)
            if current_score < previous_score:
                print(f"Best C values is: {c_array[index - 1]} with score {previous_score}")
                print(f"Number of iterations: {number_of_iterations}")
                break

            previous_score = current_score


In [23]:
find_the_best_c_value_optimised(example_array_1)
print('-------------------------------')
find_the_best_c_value_optimised(example_array_2)
print('-------------------------------')
find_the_best_c_value_optimised(example_array_3)
print('-------------------------------')

Best C values is: 0.25 with score 0.9413798650041405
Number of iterations: 7
-------------------------------
Best C values is: 1 with score 0.9376004575323043
Number of iterations: 2
-------------------------------
Best C values is: 0.055 with score 0.9375956915708012
Number of iterations: 56
-------------------------------


### Export Model

In [24]:
import pickle

# save
with open('model.pkl','wb') as f:
    pickle.dump(lr,f)