In [None]:
import numpy as np 
import pandas as pd 
import sklearn
import os
import matplotlib.pyplot as plt
%matplotlib inline
import io
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.subplots
import plotly.figure_factory as ff

In [None]:
#read in file
# The CSV location can be overridden with the PARTS_CSV environment variable so the
# notebook is not tied to one machine; the default is the original hard-coded path.
PARTS_CSV = os.environ.get(
    "PARTS_CSV",
    "/Users/SDevitt/OneDrive - Sense Corp/Jupyter_notebooks/kobayashi_maru_december2019.csv",
)
parts = pd.read_csv(PARTS_CSV)
parts.head()

In [None]:
#basic data summary, counts, looking for null etc
print("Rows     : ", parts.shape[0])
print("Columns  : ", parts.shape[1])
print("\nFeatures : \n", parts.columns.tolist())
# Total number of null cells across the whole frame
print("\nMissing values :  ", parts.isnull().sum().values.sum())
# Distinct values per column
print("\nUnique values :  \n", parts.nunique())
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line after the report.
parts.info()

In [None]:
# Bar chart of how many rows carry each label (non-null 'Text' entries per group)
plot = plt.figure(figsize=(8, 6))
rows_per_label = parts.groupby('Labels')['Text'].count()
rows_per_label.plot.bar(ylim=0)
plt.show()

no missing data, 155 unique labels, 10067 rows, 10067 unique inventory items, 4 cols

We can see our data is quite imbalanced, with 25% of the data from a single category. Let's see what that category is.

In [None]:
# Number of rows per label; value_counts() lists the most frequent label first
parts['Labels'].value_counts()

So about a quarter of the dataset is misc. I think this may need to be a two-stage classification (misc vs. everything else, and then a model for the remaining 154 classes), but I will leave this for now and see how the model does. Another note: there are many classes with just one record in the training set. These are important; I will see later if I need to do anything to make the dataset more balanced.

Inv Item      10067
Labels          155
Text 

In [None]:
from io import StringIO

# Give every distinct label an integer code and store it next to the label
label_codes, _unique_labels = pd.factorize(parts['Labels'])
parts['Label_id'] = label_codes
parts.head()

In [None]:
# Lookup tables between label text and its numeric id (used further down).
# One row per distinct (Labels, Label_id) pair, ordered by id.
unique_pairs = parts[['Labels', 'Label_id']].drop_duplicates()
df = unique_pairs.sort_values('Label_id')

# label text -> id
label_to_id = {label: label_id for label, label_id in df.to_numpy()}
# id -> label text
id_to_label = {label_id: label for label, label_id in df.to_numpy()}

In [None]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

cols = ['Text', 'Inv Item']

# Inputs (X) and integer-encoded target (Y)
X = parts[cols]
Y = parts['Label_id']

# Naive random oversampling with a fixed seed so the resample is repeatable
ros = RandomOverSampler(random_state=0)
ros_X, ros_Y = ros.fit_resample(X, Y)

# Show how many rows each class has after resampling
class_counts = Counter(ros_Y)
print(sorted(class_counts.items()))

Now each category is represented as many times as misc (2795 times). Note: I tried running this after the TF-IDF vectorization and could never get it to run.

In [None]:
# Rebuild one frame from the oversampled output: Label_id plus the raw
# (not yet vectorized) 'Text' and 'Inv Item' columns.
# fit_resample may return pandas objects or bare numpy arrays depending on the
# imbalanced-learn version, so the label column is named explicitly; with an
# array input pd.DataFrame(ros_Y) would produce a column called 0 and the later
# `df.Label_id` lookup would fail.
dfY = pd.DataFrame({'Label_id': np.asarray(ros_Y).ravel()})
# reset_index keeps both frames on the same 0..n-1 index so the index merge lines up
dfX = pd.DataFrame(ros_X, columns=cols).reset_index(drop=True)
assert len(dfX) == len(dfY), "resampled features and labels must have the same length"
df = dfY.merge(dfX, left_index=True, right_index=True, how="left")
df.info()

Now the table has 433225 rows (155 classes × 2795 rows each).

In [None]:
# Preview the first rows of the resampled frame
df.head()

Let's use TF-IDF to vectorize our text column.
This is as far as I get: the code runs fine on the original data table but not on the randomly oversampled (ROS) one.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Largest dense feature matrix (in bytes) this cell will allocate.
# NOTE(review): 4 GiB is a judgement call - tune it to the RAM of the machine.
MAX_DENSE_BYTES = 4 * 1024 ** 3

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10, norm='l2', encoding='latin-1',
                        ngram_range=(1, 2), stop_words='english')

# fit_transform returns a scipy sparse matrix. Calling .toarray() unconditionally
# allocates rows x columns floats, which for the oversampled frame is far more than
# fits in memory. Densify only when the result is small enough; otherwise keep the
# sparse matrix, which chi2 and cross_val_score accept directly.
features = tfidf.fit_transform(df.Text)
n_rows, n_cols = features.shape
dense_bytes = n_rows * n_cols * features.dtype.itemsize
if dense_bytes <= MAX_DENSE_BYTES:
    features = features.toarray()
else:
    print("Keeping TF-IDF features sparse: a dense copy would need "
          "{:.1f} GiB".format(dense_bytes / 1024 ** 3))
labels = df.Label_id
features.shape

So, our text field was vectorized into 40737 features. We can use chi2 to see which features are most correlated with each class. Note: this runs fine on the non-resampled dataset.

from sklearn.feature_selection import chi2
import numpy as np

# Number of top-scoring unigrams / bigrams to print per label
N = 2

# The vocabulary is identical for every label, so build the array once instead of
# on every loop iteration. get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() when the installed version provides it.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for label_name, label_id in sorted(label_to_id.items()):
    # One-vs-rest chi2: score every feature against "row belongs to this label"
    chi2_scores, _p_values = chi2(features, labels == label_id)
    # Ascending sort, so the most correlated terms sit at the end
    ranked_names = all_feature_names[np.argsort(chi2_scores)]
    unigrams = [v for v in ranked_names if len(v.split(' ')) == 1]
    bigrams = [v for v in ranked_names if len(v.split(' ')) == 2]
    print("# '{}':".format(label_name))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

In [None]:
# Attach the vectorized feature columns to the parts table by row index
# (this runs fine on the non-resampled dataset)
parts_og = parts.copy()
feature_frame = pd.DataFrame(features)
df = parts.merge(feature_frame, how="left", left_index=True, right_index=True)
# Peek at the top-left 10 x 10 corner only; the full frame is very wide
df.iloc[:10, :10]

In [None]:
# (rows, columns) of the current df
df.shape

In [None]:
# Baseline on the original (non-resampled) table: counts -> TF-IDF -> Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(
    parts['Text'], parts['Labels'], random_state=0
)

# Both transformers are fitted on the training split only
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

nb_model = MultinomialNB()
clf = nb_model.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import seaborn as sns

# Candidate classifiers, compared with k-fold cross-validated accuracy.
# Every estimator that takes a seed is seeded so reruns give the same scores.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5

# NOTE(review): `features` / `labels` come from the TF-IDF cell, which vectorizes the
# randomly oversampled frame. Duplicated rows can then fall into both the training
# and the validation folds, so these accuracies are likely optimistic - consider
# resampling inside each fold instead.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies)
    )

# One row per (model, fold); built once from the collected records
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the per-fold accuracy distribution with the individual folds overlaid
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()
