In [3]:
import pickle
import os
import pandas as pd
import numpy as np
from scipy.stats import randint
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Read the raw dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./classification.csv')
# Trailing expression: shows (rows, columns) as the cell output.
df.shape

In [None]:
# Create a new dataframe with only the two columns used downstream:
# 'parent' (the class label) and 'text' (the document to classify).
# .copy() makes df1 independent of df.
df1 = df[['parent', 'text']].copy()

# Remove rows whose text is missing (NaN).
# (The original `df1.[...]` was a syntax error; plain boolean indexing is intended.)
df1 = df1[pd.notnull(df1['text'])]

# The column selection above already yields these names; the assignment is
# kept so the schema of df1 is stated explicitly at this point.
df1.columns = ['parent', 'text']

# Trailing expression: (rows, columns) after dropping rows without text.
df1.shape

In [None]:
# Percentage of complaints with text.
# `total` is the number of rows in df1 with non-null text; it is divided by the
# row count of the raw frame `df` to obtain the share of rows that carry text.
total = df1['text'].notnull().sum()
# Trailing expression: the percentage, rounded to one decimal (0.0 for an empty df
# to avoid a division by zero).
round(total / len(df) * 100, 1) if len(df) else 0.0


In [None]:
# Distinct values of the 'parent' column, displayed as a single-column array.
pd.DataFrame(df['parent'].unique()).values

In [None]:
# Because the computation is time consuming (in terms of CPU), the data is sampled.
# The sample size is capped at len(df1): DataFrame.sample raises ValueError when
# asked for more rows than exist (without replacement).
# random_state=1 keeps the sample reproducible across runs.
df2 = df1.sample(n=min(6000, len(df1)), random_state=1).copy()


In [None]:
# Number of sampled rows per 'parent' value, shown as a one-column frame.
df2['parent'].value_counts().to_frame()


In [None]:
# Encode every distinct 'parent' value as an integer id. factorize() numbers
# the values in order of first appearance.
df2['category_id'] = df2['parent'].factorize()[0]

# Lookup table with one row per (parent, category_id) pair.
# (The method is drop_duplicates; `drop_duplicate` does not exist.)
category_id_df = df2[['parent', 'category_id']].drop_duplicates()

# Dictionaries for future use: parent -> id and id -> parent.
# `.values` is an attribute (ndarray of row pairs), not a method.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'parent']].values)

# Preview the sampled frame, now including the 'category_id' column.
df2.head()

In [None]:
# TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1, 2)) with
# scikit-learn's built-in English stop-word list; min_df=5 and
# sublinear_tf=True are passed through to TfidfVectorizer unchanged.
tfidf = TfidfVectorizer(sublinear_tf=True,
                        min_df=5,
                        ngram_range=(1, 2),
                        stop_words='english')

# We transform each complaint into a vector (dense array: one row per document).
features = tfidf.fit_transform(df2.text).toarray()

# Persist the feature matrix. The context manager closes the file handle,
# which the original bare open(...) call leaked.
with open('tfidf1.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)

labels = df2.category_id

# features.shape is a (n_documents, n_features) 2-tuple, so the format string
# needs two %d placeholders (the original had one and raised TypeError).
print("Each of the %d complaints is represented by %d features "
      "(TF-IDF score of unigrams and bigrams)" % features.shape)


In [None]:
# Finding the three most correlated terms with each of the product categories
N = 3

# Vocabulary in column order of `features`. It does not change per category,
# so it is built once outside the loop. get_feature_names_out() is the current
# scikit-learn accessor; older releases only provide get_feature_names().
_get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.array(_get_names())

for parent, category_id in sorted(category_to_id.items()):
    # One-vs-rest chi-squared test: this category against all others.
    features_chi2 = chi2(features, labels == category_id)
    # Ascending sort of the chi2 statistics, so the strongest terms come last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("\n==> %s:" % (parent))
    print(" * Most correlated unigrams are: %s" % (', '.join(unigrams[-N:])))
    print(" * Most correlated bigrams are: %s" % (', '.join(bigrams[-N:])))
    
    
    
    

In [None]:
# Candidate classifiers compared with k-fold cross-validation.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0)
]

# 5-fold cross validation
CV = 5
# NOTE(review): CV_df is not referenced by any visible cell (results are
# collected into cv_df below); kept in case a later cell relies on it.
CV_df = pd.DataFrame(index=range(CV * len(models)))

# One (model_name, fold_idx, accuracy) record per model and fold.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        # list.append takes exactly one argument, so the record is a tuple.
        entries.append((model_name, fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Per-model mean and standard deviation of the cross-validation accuracies
# stored in cv_df (one row per model and fold).
mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
std_accuracy = cv_df.groupby('model_name').accuracy.std()

# Put the two Series side by side; `keys` supplies the column headers.
# (The original passed the header strings themselves to pd.concat.)
acc = pd.concat([mean_accuracy, std_accuracy], axis=1,
                keys=['Mean Accuracy', 'Standard Deviation'])
acc


In [None]:
# logistic regression

# train_test_split returns a train/test pair for EVERY array passed in. Three
# arrays go in (features, labels, row index), so six values come out; the
# original unpacked only four and raised ValueError.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, labels, df2.index, test_size=0.30, random_state=1)

model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions on the held-out split; the evaluation cells read y_pred.
y_pred = model.predict(X_test)

# Persist the fitted vectorizer and classifier. The original imported joblib
# from a misspelled module path with a stray line continuation; pickle (already
# imported at the top of the notebook and used for the feature matrix) is used
# here instead, so no extra import is needed. The export directory is created
# first so open() cannot fail on a missing folder.
os.makedirs('./exports', exist_ok=True)
with open("./exports/tfidfvectorizerlogisticregression.pkl", 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open("./exports/classifierlogisticregression.pkl", 'wb') as classifier_file:
    pickle.dump(model, classifier_file)


In [None]:
# Per-class precision / recall / F1 on the held-out split.
print('\t\t\t\t\t\tCLASSIFICATION METRICS\n')
# `labels` pins the report rows to the ids in category_id_df, and `target_names`
# takes the matching names from the same rows. Without `labels`, a class that
# is absent from the test split makes the name list longer than the classes found.
print(metrics.classification_report(y_test, y_pred,
                                    labels=category_id_df.category_id.values,
                                    target_names=category_id_df.parent.values))

In [None]:
# Confusion matrix of the held-out predictions. `labels` fixes the row/column
# order to the ids in category_id_df, so the matrix lines up with the tick
# labels taken from the same table even if a class is missing from the split.
conf_mat = confusion_matrix(y_test, y_pred,
                            labels=category_id_df.category_id.values)

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_mat, annot=True, cmap='Blues', fmt='d',
            xticklabels=category_id_df.parent.values,
            yticklabels=category_id_df.parent.values,
            ax=ax)

# Rows are the true classes and columns the predicted ones (confusion_matrix
# convention). The original called the undefined name `plyt` here.
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
# The model fitted in this notebook's training cell is LogisticRegression.
ax.set_title('Confusion matrix - LogisticRegression\n', size=16)