### Term Frequency - Inverse Document Frequency

In [2]:
#import serialized objects
import pandas as pd

corpus = pd.read_csv(filepath_or_buffer = 'cleaned_text.csv', index_col=[0])
other_data = pd.read_csv(filepath_or_buffer= 'other_data.csv', index_col=[0])

In [3]:
other_data.head()

Unnamed: 0,Age,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
2,4.094345,3,0,0.0,0,1,Dresses
3,3.912023,5,1,0.0,1,0,Pants
4,3.850148,5,1,2.44949,0,4,Blouses
5,3.89182,2,0,2.0,0,1,Dresses
6,3.663562,5,1,1.0,1,4,Knits


In [4]:
corpus.head()

Unnamed: 0,cleaned_text
2,"['I', 'high', 'hope', 'dress', 'really', 'want..."
3,"['I', 'love', 'love', 'love', 'jumpsuit', 'fun..."
4,"['This', 'shirt', 'flattering', 'due', 'adjust..."
5,"['I', 'love', 'tracy', 'reese', 'dress', 'one'..."
6,"['I', 'aded', 'basket', 'hte', 'last', 'mintue..."


In [5]:
#vectorized data using ngram
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
x_count = tfidf_vec.fit_transform(corpus['cleaned_text'])

print(x_count.shape)

(19662, 16983)


In [6]:
## save vectorized data to dataframe

x_tfidf_df = pd.DataFrame(x_count.toarray())
x_tfidf_df.columns = tfidf_vec.get_feature_names()
x_tfidf_df.head()

Unnamed: 0,00,000,002first,00p,00p0p,00p0rxxsxs,02,025,02xs,03,...,zipperone,zippie,zipping,zombie,zone,zoolanders,zoom,zooming,zuma,ã¼ber
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
#merge dataframes

df = pd.merge(left= x_tfidf_df, right= other_data, left_index=True, right_index=True)
df.shape

(16462, 16990)

**Train Test Split**

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

y = encoder.fit_transform(df['Class Name'].values.reshape(-1,1))
x = df.drop(labels='Class Name', axis = 1)

x_train, x_test, y_train, y_test = train_test_split(x,y, train_size= 0.8, random_state = 10 )

  return f(**kwargs)


**Modeling**

In [14]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [15]:
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)
cross_val_score(estimator = rf, X = x_train, y = y_train, cv = k_fold, scoring = 'accuracy', n_jobs = -1, verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 12.1min finished


array([0.60933941, 0.6055429 , 0.62604404, 0.61541382, 0.61488796])

**NB :**
Accuracy when using the TF-IDF vectorizer is lower than when using count vectorizer to create the document term matrix.

Accuracy of the model can be improved by

- Increasing the number of trees in the random forest algorithm
- Using a diffrent algorithm
- Processing the text data even more