<a href="https://colab.research.google.com/github/amittal-tcd/TCD-work/blob/master/Text%20Analytics/Step2%20-%20Model%20Improvement%20and%20Feature%20Importances/Text_Analytics_Group_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Libraries
Need to run only once in a session


In [0]:
!pip install catboost
!pip install shap

## Import libraries
This cell also downloads the NLTK corpora, which only needs to be done once per session.

In [0]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import nltk
from catboost import CatBoostRegressor, Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
import matplotlib.pyplot as plt
import scipy
from sklearn import metrics
import seaborn as sns
dest = ''
import shap
from difflib import get_close_matches
from sklearn.decomposition import PCA
import io
import requests
import warnings
# Silence library warnings for the rest of the session and fix NumPy's global
# seed so the random row sampling further down is repeatable.
warnings.filterwarnings('ignore')
np.random.seed(100)

# NLTK resources fetched for this session: the word list and stop-word list
# (both read by the cleaning cell) plus the 'punkt' tokenizer data.
for resource in ('words', 'stopwords'):
    nltk.download(resource)
nltk.download('punkt')

## Download Data from GIT and Keep Required Fields Only

In [0]:
# Raw review data hosted in the project's GitHub repository.
url = "https://github.com/amittal-tcd/TCD-work/raw/master/Text%20Analytics/Step2%20-%20Model%20Improvement%20and%20Feature%20Importances/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv"
df = pd.read_csv(url)

# Keep only the fields used downstream. reset_index guarantees that row labels
# equal row positions, which the label-based drop() below relies on.
df2 = df[['reviews.rating','reviews.text','reviews.title','reviews.date']].reset_index(drop=True)

# Draw 20% of the row count as *integer* positions (with replacement) and
# de-duplicate, so the held-out share ends up a little under 20%. Integer
# positions are required for .iloc / drop(); the previous uniform()+round()
# approach produced floats and under-sampled the first and last rows.
n_rows = df2.shape[0]
s = np.unique(np.random.randint(low=0, high=n_rows, size=round(0.2 * n_rows)))

# .copy() makes both subsets independent frames, so later column assignments
# do not write into a view of df2.
dfv2 = df2.iloc[s, :].copy()      ## validation data (held-out sample)
dft2 = df2.drop(index=s).copy()   ## training data (all remaining rows)

print(df2.shape,dft2.shape,dfv2.shape)
dft2.head()

## Cleaning text
- Removing punctuation and stop words
- Experimentally removing words that contain digits
- Removing non-English words
- Stemming words so that variants of the same word are merged (currently disabled; see the commented-out `ps.stem(w)` in the code below)

In [0]:
stop_words = set(stopwords.words('english'))
words = set(nltk.corpus.words.words())
ps = nltk.stem.PorterStemmer()  # only needed for the optional stemming experiment noted below

tokenizer = RegexpTokenizer(r'\w+')


def clean_review_text(text):
    """Return `text` reduced to lower-cased, space-joined dictionary words.

    Tokens are split on non-word characters (which drops punctuation) and a
    token is kept only if it
      * is not a stop word (compared in lower case, because the stop-word
        list is lower case and would otherwise miss capitalised stop words),
      * appears in the NLTK word list either as written or in lower case, and
      * contains no digit.

    Non-string input (e.g. a missing review) yields an empty string instead
    of raising inside the tokenizer.
    """
    if not isinstance(text, str):
        return ''
    kept = []
    for token in tokenizer.tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        if token not in words and lowered not in words:
            continue
        if any(char.isdigit() for char in token):
            continue
        # Optional experiment: append ps.stem(lowered) instead to merge word variants.
        kept.append(lowered)
    return ' '.join(kept)


# Apply the same cleaning to the training AND validation text so both are
# vectorised from identically prepared input. assign() builds new frames
# rather than writing into a slice of the source data.
dft2 = dft2.assign(**{'reviews.text': dft2['reviews.text'].map(clean_review_text)})
dfv2 = dfv2.assign(**{'reviews.text': dfv2['reviews.text'].map(clean_review_text)})
dft2.head()

## Extracting Features

In [0]:
# Bag-of-words counts over single words. The vocabulary is learned from the
# training text only and then re-used to encode the validation text.
# Alternatives tried previously (kept for reference, not active):
#   vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1,1),strip_accents = 'ascii')
#   pca = PCA(n_components = 50) applied to the dense count matrices
vectorizer = CountVectorizer(analyzer='word',ngram_range=(1, 1))

# fillna('') keeps a missing review from crashing the vectorizer.
X = vectorizer.fit_transform(dft2['reviews.text'].fillna(''))
X2 = vectorizer.transform(dfv2['reviews.text'].fillna(''))

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Fall back to the old name on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

# Dense frames with one column per vocabulary word; rows are re-indexed 0..n-1.
dft3 = pd.DataFrame(X.toarray(), columns=feature_names)
dfv3 = pd.DataFrame(X2.toarray(), columns=feature_names)

dft3.head()

## Creating flag which is 1 if the date of review is on a weekday and 0 if it is on a weekend

In [0]:
# Target flag: 1 when the review date falls on Monday-Friday, 0 otherwise.
# Only the first 10 characters (YYYY-MM-DD) of the date string are parsed.
# np.where returns a plain array, so values are assigned by row position.
train_dates = pd.to_datetime(dft2['reviews.date'].str[:10], format='%Y-%m-%d')
dft3['TimeCycle'] = np.where(train_dates.dt.dayofweek < 5, 1, 0)

valid_dates = pd.to_datetime(dfv2['reviews.date'].str[:10], format='%Y-%m-%d')
dfv3['TimeCycle'] = np.where(valid_dates.dt.dayofweek < 5, 1, 0)

# Alternative targets from earlier experiments (inactive):
# df3['TimeCycle'] =  pd.to_datetime(df2['reviews.date'].str[:10], format='%Y-%m-%d').dt.day
# df3['TimeCycle'] = df2['reviews.rating']
dft3.head()

## Train test validation split

In [0]:
# Split the training frame 80/20 into fit and evaluation parts: every column
# except 'TimeCycle' is a feature, 'TimeCycle' is the label. The fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dft3.drop(columns=['TimeCycle']),
    dft3['TimeCycle'],
    test_size=0.20,
    random_state=1,
)

## Model Training

Change parameters for fine-tuning model

In [0]:
# Wrap the fit and evaluation splits in CatBoost Pool objects.
train_pool = Pool(data=X_train, label=y_train)
test_pool = Pool(data=X_test, label=y_test.values)

# Hyper-parameters gathered in one place so they are easy to tweak.
catboost_params = dict(
    iterations=5000,
    learning_rate=0.1,
    random_strength=0.1,
    depth=6,
    metric_period=250,
    eval_metric='AUC',
    task_type="GPU",   # requires a GPU runtime
    devices='0:1',
)
model = CatBoostClassifier(**catboost_params)

# Fit on the training pool and track the eval metric on the held-out pool.
model.fit(train_pool, eval_set=test_pool, plot=True)

## Randomized Search over a Parameter Grid for Model Improvement (Experimental - Last step for final model)

Best 'params': {'depth': 6, 'iterations': 5000, 'learning_rate': 0.1} 

In [0]:
# NOTE: this rebinds `model` to a fresh classifier, so every later cell that
# uses `model` works with the estimator produced here, not the one fitted in
# the training cell above.
model = CatBoostClassifier(metric_period = 100,
                            eval_metric='AUC',
                            task_type = "GPU",
                            devices = '0:1')
    
# Candidate values for each hyper-parameter; randomized_search draws
# combinations from these lists.
grid = {'learning_rate': [0.05, 0.1, 0.01],
        'depth': [4, 6, 10],
        'iterations': [2000,5000,1000]}

# The search is given the full training frame (all columns except the
# 'TimeCycle' label as X, 'TimeCycle' as y).
randomized_search_result = model.randomized_search(grid, 
                                       X = dft3.drop(columns = ['TimeCycle']),
                                       y = dft3['TimeCycle'], 
                                       plot=True)

# Show what the search returned (see CatBoost docs for the result layout).
print(randomized_search_result)

## Making Prediction and calculating accuracy

In [0]:
# Predict the weekday flag for the validation rows (label column removed),
# round the predictions, and report accuracy as a percentage.
raw_predictions = model.predict(dfv3.drop(columns='TimeCycle'))
result = raw_predictions.round(0)

final_accuracy = 100 * acc(dfv3['TimeCycle'], result)
final_accuracy

## AUC and Confusion Matrix

In [0]:
# --- ROC curve on the validation set ---------------------------------------
# Column 1 of predict_proba is taken as the score for class 1.
y_pred_proba = model.predict_proba(dfv3.drop(columns='TimeCycle'))[:, 1]
fpr, tpr, _ = metrics.roc_curve(dfv3['TimeCycle'], y_pred_proba)
auc = metrics.roc_auc_score(dfv3['TimeCycle'], y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="ROC, auc=" + str(auc))
ax.legend(loc=4)
plt.show()
fig.savefig(dest + 'ROC.png', dpi=fig.dpi)

# --- Confusion matrix of the rounded predictions ---------------------------
cm = metrics.confusion_matrix(dfv3['TimeCycle'], result)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", vmin=0.2, ax=ax);
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Class')
ax.set_xlabel('Predicted Class')
plt.show()
fig.savefig(dest + 'Confusion Matrix.png', dpi=fig.dpi)

## Plotting SHAP values to check whether each individual feature has a positive or negative effect on the final prediction

### 1. Creating shap_values from validation data predictions

In [0]:
# Ask CatBoost for per-row SHAP values on the validation data. Per the CatBoost
# documentation the result has one column per feature plus one extra trailing
# column holding the expected (base) value of the model output.
shap_values = model.get_feature_importance(Pool(dfv3.drop(columns = ['TimeCycle']), label=dfv3['TimeCycle']),type="ShapValues")
expected_value = shap_values[0,-1] ## Expected (base) value, read from the last column of the first row; it is not a per-row prediction
shap_values = shap_values[:,:-1] ## Per-feature contributions for each row; the trailing expected-value column is dropped

### 2. Creating Summary of SHAP Values for Broad understanding of Impact of top Features

In [0]:
# Summary of SHAP values across the validation rows.
# Raise or lower max_display to change how many features are shown.
shap.summary_plot(
    shap_values,
    features=dfv3.drop(columns=['TimeCycle']),
    max_display=100,
)

### 3. Let's look at a single prediction to verify results from summary plot

#### First we draw a force plot

In [0]:
# Position of the validation row to explain. Change it to inspect another
# row; a range can also be given for a clustered force plot.
row_number = 16

# Load the JavaScript that SHAP's interactive plots need, then draw the
# force plot for the chosen row.
shap.initjs()
shap.force_plot(
    expected_value,
    shap_values=shap_values[row_number, :],
    features=dfv3.drop(columns=['TimeCycle']).iloc[row_number, :],
)

#### Now, decision plot of the same prediction

In [0]:
# Decision plot for the same validation row. Feature names are taken from all
# dfv3 columns except the last one, which is expected to be 'TimeCycle'.
shap.decision_plot(
    expected_value,
    shap_values[row_number, :],
    features=dfv3.drop(columns=['TimeCycle']).iloc[row_number, :],
    feature_names=list(dfv3.columns[:-1]),
)

## Looking at Overall Model Feature Importances

Another way to look at importances apart from SHAP values. PredictionValuesChange for non-ranking metrics and LossFunctionChange for ranking metrics (the value is determined automatically)

In [0]:
# Pair every feature column with the model's importance score (CatBoost's
# default importance type; see its documentation) and rank from most to least
# important.
df5 = pd.DataFrame(
    data={
        'Features': dft3.drop(columns='TimeCycle').columns,
        'Importances': model.get_feature_importance(),
    }
).sort_values(by='Importances', ascending=False)

# Written to the session's working directory (visible under "Files" in Colab).
df5.to_csv(dest + 'Importances.csv')
df5.head(100)

## Point-Biserial Correlations
Another way to look at the relationship between continuous features and a binary target

In [0]:
from scipy import stats

# One (correlation, p-value, feature name) record per feature column, each
# measured against the binary 'TimeCycle' target.
corr_records = [
    [*stats.pointbiserialr(dft3[feature], dft3["TimeCycle"]), feature]
    for feature in dft3.drop(columns='TimeCycle')
]

# Index by feature name and order by significance (smallest p-value first).
df_corr = (
    pd.DataFrame(corr_records, columns=['Correlation', 'p.Value', 'Column'])
    .set_index('Column')
    .sort_values('p.Value')
)

# Preview of the features significant at the 5% level.
df_corr[df_corr['p.Value'] <= 0.05].head()

### Let's look at the histogram of correlations to understand strength and direction of correlations of all significantly correlated features

In [0]:
# Features whose correlation with the target is significant at the 5% level.
# .copy() makes this an independent frame so the helper column added below is
# not written into a filtered slice of df_corr.
df_corr2 = df_corr[df_corr['p.Value'] <= 0.05].copy()

# Histogram of the significant correlation coefficients. plt.hist plots raw
# counts here (no density normalisation), so the y-axis is labelled as a count.
fig = plt.figure()
plt.hist(df_corr2['Correlation'], bins=50, color = 'g')
plt.xlabel('Correlation Value')
plt.ylabel('Number of Features')
plt.show()
fig.savefig(dest+'fig5.png', dpi=fig.dpi)
print("Number of significantly correlated features = ",df_corr2.shape[0])

# Rank by absolute correlation so strong negative and strong positive
# relationships both rise to the top; the helper column stays on df_corr2 but
# is dropped from the ranked result.
df_corr2['Correlation2'] = df_corr2['Correlation'].abs()
df_corr3 = df_corr2.sort_values('Correlation2', ascending = False).drop(columns = 'Correlation2')

print("\nTop 100 significantly correlated features by correlation coefficient values\n")
df_corr3.head(100)