In [264]:
#importing our libraries
import pandas as pd
import numpy as np
import nltk
import math
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('words')
nltk.download('wordnet')
stop = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from tensorflow.keras.layers import Activation, Dense,Embedding,Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers import Dense , Input , LSTM
from sklearn.impute import SimpleImputer 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [265]:
#improting our data sets for both training and test 
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

**Analyzing train dataset**

In [266]:
train_df.head()

Unnamed: 0,Id,Age,Review_Title,Review,Pos_Feedback_Cnt,Division,Department,Product_Category,Rating,Recommended
0,17274,34,Cute fall/holiday top,Love this top! the quality is magnificent and ...,1,General,Tops,Blouses,5,1
1,5921,35,,,0,General,Tops,Blouses,5,1
2,16479,40,Disappointed,"Sleeves were tight, was difficult to put on ?....",15,General,Tops,Blouses,2,0
3,1925,28,Gorgeous detailing,I never write reviews but this clothe is so fa...,3,General Petite,Clothes,Clothes,5,1
4,5691,39,Cute and comfortable tee!,Love this tshirt! casual but can be clotheed u...,0,General,Tops,Knits,5,1


In [267]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14091 entries, 0 to 14090
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                14091 non-null  int64 
 1   Age               14091 non-null  int64 
 2   Review_Title      11732 non-null  object
 3   Review            13588 non-null  object
 4   Pos_Feedback_Cnt  14091 non-null  int64 
 5   Division          14080 non-null  object
 6   Department        14080 non-null  object
 7   Product_Category  14080 non-null  object
 8   Rating            14091 non-null  int64 
 9   Recommended       14091 non-null  int64 
dtypes: int64(5), object(5)
memory usage: 1.1+ MB


In [268]:
train_df.shape

(14091, 10)

In [269]:
train_df.describe()

Unnamed: 0,Id,Age,Pos_Feedback_Cnt,Rating,Recommended
count,14091.0,14091.0,14091.0,14091.0,14091.0
mean,11713.941736,43.093677,2.465474,4.197786,0.822511
std,6768.152493,12.176401,5.519936,1.109792,0.382096
min,0.0,18.0,0.0,1.0,0.0
25%,5860.0,34.0,0.0,4.0,1.0
50%,11693.0,41.0,1.0,5.0,1.0
75%,17555.5,51.0,3.0,5.0,1.0
max,23485.0,94.0,122.0,5.0,1.0


In [270]:
train_df.columns

Index(['Id', 'Age', 'Review_Title', 'Review', 'Pos_Feedback_Cnt', 'Division',
       'Department', 'Product_Category', 'Rating', 'Recommended'],
      dtype='object')

In [271]:
#drop out unnecessary columns from our final data frame
train_df=train_df.drop(['Id', 'Age', 'Division', 'Department', 'Product_Category','Pos_Feedback_Cnt'], axis = 1)

In [272]:
train_df

Unnamed: 0,Review_Title,Review,Rating,Recommended
0,Cute fall/holiday top,Love this top! the quality is magnificent and ...,5,1
1,,,5,1
2,Disappointed,"Sleeves were tight, was difficult to put on ?....",2,0
3,Gorgeous detailing,I never write reviews but this clothe is so fa...,5,1
4,Cute and comfortable tee!,Love this tshirt! casual but can be clotheed u...,5,1
...,...,...,...,...
14086,Too flowy,The pattern and fabric on this clothe are very...,3,0
14087,"Soft, snuggly and cute","Like the previous reviewer stated, it's more l...",5,1
14088,Gorgeous!,This sweater is so lovely.. i like the fact th...,5,1
14089,Really versatile!,"I just love this top, it has a flattering cut,...",5,1


In [273]:
#checking the missing values in Review and Review title columns
print(train_df['Review'].isnull().sum())
print(train_df['Review_Title'].isnull().sum())

503
2359


In [274]:
#handling with missing values in both review and review title column
imputer = SimpleImputer(strategy ='most_frequent')
data = imputer.fit_transform(train_df.iloc[:,1:].values)
train_df['Review']=data

In [275]:
imputer = SimpleImputer(strategy ='most_frequent')
data = imputer.fit_transform(train_df.iloc[:,0:].values)
train_df['Review_Title']=data

In [276]:
train_df['Review'].isnull().sum()

0

In [277]:
train_df['Review_Title'].isnull().sum()

0

In [278]:
#preprocessing the text data
#merging reviews and review titles
train_df['Reviews'] = train_df['Review']+' '+train_df['Review_Title']

In [279]:
#cleaning df_train['Reviews'] column

# lowercase reviews
train_df['clean_reviews'] = train_df['Reviews'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# add a space before and after every punctuation 
train_df['clean_reviews'] = train_df['Reviews'].str.replace(r'([^\w\s]+)', ' \\1 ')

# remove punctuation
train_df['clean_reviews'] = train_df['Reviews'].str.replace('[^\w\s]','')

# remove stopwords
train_df['clean_reviews'] = train_df['Reviews'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# remove digits
train_df['clean_reviews'] = train_df['Reviews'].str.replace('\d+', '')

# define corpus
words = set(nltk.corpus.words.words())

# remove non-corpus words
def remove_noncorpus(sentence):
    print(sentence)
    return " ".join(w for w in nltk.wordpunct_tokenize(sentence) if w.lower() in words or not w.isalpha())

train_df['clean_reviews'] = train_df['Reviews'].map(remove_noncorpus)

lemmatizer = WordNetLemmatizer()

#final train_df['Review'] column
train_df['Reviews'] = [lemmatizer.lemmatize(row) for row in train_df['clean_reviews']]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
I love this tee. it has a very flattering fit and the fabric is very soft. it runs small, especially in the arms. i suggest sizing up. the tee looks magnificent on. it is slimming and super comfortable. This tee is flattering and comfortable.
Really nice thick fabric, yet lays really nice. scarf included, love the poncho style. will get me throught the end of pregnancy and will look magnificent after as well! Thick and warm
The fit on these is starkly different than the "black hawk" version also being sold online. i could barely get my leg in those! i typically wear a 27-29 - a 28-29 in this brand of jeans - and the 29 fit magnificent. i have very large calves, so most "kick out" cop flares don't really provide the full effect for my body type, but these worked really well and i could definitely see the flare at the bottom. i am 5'8" and these hit right above my ankle, just as pictured online. 
----------
what made me Was

In [280]:
train_df

Unnamed: 0,Review_Title,Review,Rating,Recommended,Reviews,clean_reviews
0,Cute fall/holiday top,Love this top! the quality is magnificent and ...,5,1,Love this top ! the quality is magnificent and...,Love this top ! the quality is magnificent and...
1,Love it!,Excellent fit and i've gotten so many complime...,5,1,Excellent fit and i ' gotten so many . i buy a...,Excellent fit and i ' gotten so many . i buy a...
2,Disappointed,"Sleeves were tight, was difficult to put on ?....",2,0,"were tight , was difficult to put on ?. for th...","were tight , was difficult to put on ?. for th..."
3,Gorgeous detailing,I never write reviews but this clothe is so fa...,5,1,I never write but this clothe is so fantastic ...,I never write but this clothe is so fantastic ...
4,Cute and comfortable tee!,Love this tshirt! casual but can be clotheed u...,5,1,Love this ! casual but can be up with and a sc...,Love this ! casual but can be up with and a sc...
...,...,...,...,...,...,...
14086,Too flowy,The pattern and fabric on this clothe are very...,3,0,The pattern and fabric on this clothe are very...,The pattern and fabric on this clothe are very...
14087,"Soft, snuggly and cute","Like the previous reviewer stated, it's more l...",5,1,"Like the previous reviewer stated , it ' s mor...","Like the previous reviewer stated , it ' s mor..."
14088,Gorgeous!,This sweater is so lovely.. i like the fact th...,5,1,This sweater is so lovely .. i like the fact t...,This sweater is so lovely .. i like the fact t...
14089,Really versatile!,"I just love this top, it has a flattering cut,...",5,1,"I just love this top , it a flattering cut , f...","I just love this top , it a flattering cut , f..."


In [281]:
train_df=train_df[['Reviews', 'Rating', 'Recommended']]

In [282]:
train_df

Unnamed: 0,Reviews,Rating,Recommended
0,Love this top ! the quality is magnificent and...,5,1
1,Excellent fit and i ' gotten so many . i buy a...,5,1
2,"were tight , was difficult to put on ?. for th...",2,0
3,I never write but this clothe is so fantastic ...,5,1
4,Love this ! casual but can be up with and a sc...,5,1
...,...,...,...
14086,The pattern and fabric on this clothe are very...,3,0
14087,"Like the previous reviewer stated , it ' s mor...",5,1
14088,This sweater is so lovely .. i like the fact t...,5,1
14089,"I just love this top , it a flattering cut , f...",5,1


In [283]:
#taking Rating column 
y_rating=train_df[['Rating']]

In [284]:
y_rating

Unnamed: 0,Rating
0,5
1,5
2,2
3,5
4,5
...,...
14086,3
14087,5
14088,5
14089,5


After doing a bit research I found out that we can get the best result in review analysis with deep learning model. So I use this deep learning model for both rating and recommendation prediction

**Model for Predicting Rating using RNN and Bag of Words**

In [285]:
max_features = 40000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_df['Reviews'])
list_tokenized_train = tokenizer.texts_to_sequences(train_df['Reviews'])

maxlen = 200
X_token= pad_sequences(list_tokenized_train, maxlen=maxlen)


y_1 = np.zeros((y_rating.shape[0], 5))
y_1[np.arange(y_rating.shape[0]), y_rating['Rating']-1] = 1
y_1 = pd.DataFrame(y_1,columns=['1','2','3','4','5'],dtype='int64')

In [286]:
batch_size = 100
epochs = 10
embed_size = 130
model_rating = Sequential()
model_rating.add(Embedding(max_features, embed_size))
model_rating.add(Bidirectional(LSTM(130, return_sequences = True)))
model_rating.add(Dense(20, activation="relu"))
model_rating.add(Dense(5, activation="softmax"))
model_rating.add(Dropout(0.05))
model_rating.add(GlobalMaxPool1D())
model_rating.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


model_rating.fit(X_token,y_1, batch_size=batch_size, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fcb960fad50>

**Model for Predicting Recommendation**

In [287]:
max_features = 40000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_df['Reviews'])
list_tokenized_train = tokenizer.texts_to_sequences(train_df['Reviews'])

maxlen = 170
X_token = pad_sequences(list_tokenized_train, maxlen=maxlen)
y_2 = train_df['Recommended']

batch_size = 100
epochs = 10
embed_size = 128
model_recommend = Sequential()
model_recommend.add(Embedding(max_features, embed_size))
model_recommend.add(Bidirectional(LSTM(32, return_sequences = True)))
model_recommend.add(GlobalMaxPool1D())
model_recommend.add(Dense(20, activation="relu"))
model_recommend.add(Dropout(0.05))
model_recommend.add(Dense(1, activation="sigmoid"))
model_recommend.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


model_recommend.fit(X_token,y_2, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fcaddcd5310>

In [288]:
#id column for using later
id_df=test_df['Id']
#drop out unnecessary columns from our test data set
test_df=test_df.drop(['Id', 'Age', 'Division', 'Department', 'Product_Category','Pos_Feedback_Cnt'], axis = 1)

In [289]:
test_df

Unnamed: 0,Review_Title,Review
0,Magnificent clothe!,"In contrast to the other reviewer, i love this..."
1,Shapeless tent,I tried this on in the store and it was huge. ...
2,Versatile and then some,"I thought this was a fun piece to have, but di..."
3,So simple but so cute!,I bought the multi-color stripe and it is ador...
4,Magnificent simple tank,The wide strap style is very flattering. this ...
...,...,...
9390,Love this top!!!!,"This is a sweet, lightweight top that i will w..."
9391,Magnificent addition to my wardrobe!,"This is a magnificent piece, you can do many t..."
9392,Channel your inner joanna gaines....,"This anorak is excellention! excellent weight,..."
9393,Awkward fitting,"First, the fabric is beautiful and lovely for ..."


In [290]:
#checking the missing values in Review and Review titles
test_df['Review'].isnull().sum()

342

In [291]:
test_df['Review_Title'].isnull().sum()

1451

In [292]:
#handling with missing values in both review and review title column
imputer = SimpleImputer(strategy ='most_frequent')
data = imputer.fit_transform(test_df.iloc[:,1:].values)
test_df['Review']=data

In [293]:
test_df['Review'].isnull().sum()

0

In [294]:
imputer = SimpleImputer(strategy ='most_frequent')
data = imputer.fit_transform(test_df.iloc[:,0:].values)
test_df['Review_Title']=data

In [295]:
test_df['Review_Title'].isnull().sum()

0

In [296]:
#preprocessing the text data
test_df['Reviews'] = test_df['Review']+' '+test_df['Review_Title']

#cleaning df_test['Reviews'] column

# lowercase reviews
test_df['clean_reviews'] = test_df['Reviews'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# add a space before and after every punctuation 
test_df['clean_reviews'] = test_df['Reviews'].str.replace(r'([^\w\s]+)', ' \\1 ')

# remove punctuation
test_df['clean_reviews'] = test_df['Reviews'].str.replace('[^\w\s]','')

# remove stopwords
test_df['clean_reviews'] = test_df['Reviews'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# remove digits
test_df['clean_reviews'] = test_df['Reviews'].str.replace('\d+', '')

# define corpus
words = set(nltk.corpus.words.words())

# remove non-corpus words
def remove_noncorpus(sentence):
    print(sentence)
    return " ".join(w for w in nltk.wordpunct_tokenize(sentence) if w.lower() in words or not w.isalpha())

test_df['clean_reviews'] = test_df['Reviews'].map(remove_noncorpus)


test_df['Reviews'] = test_df['clean_reviews']

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Love wearing these, i see them getting a lot of wear in the winter So cozy
I thought this top would be a nice addition to my fall wardrobe, and while the color didn't disappoint, everything else did. it's a very loose slub so it's very see through and thin, i'd have to wear a cami under it. also the tie front has a very high slit! like it went almost halfway up the shirt and definitely would have shown skin unless you layered the shirt. i do think this top would be a cute throw on top for yoga/ gym wear, but definitely not at this price. pass for me sadly... Not for me... and not at this price
Yep, i used to have a nice body, but now have the dreaded menopause stomach... and some excess baggage. this clothe was very flattering. wore it today, easter sunday and had literally a dozen people seek me out to tell me how magnificent the clothe was or how nice i looked. it is a bright pink, think pepto bismol! heavy brocade type

In [297]:
#tokenization
list_tokenized_test = tokenizer.texts_to_sequences(test_df['Reviews'])
maxlen = 200
X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [298]:
#predict the ratings
rating_prediction=model_rating.predict(X_test)

In [299]:
rating_prediction

array([[0.00400678, 0.01822011, 0.04842268, 0.30910134, 0.7071148 ],
       [0.13796268, 0.2712265 , 0.33492956, 0.34178144, 0.17170334],
       [0.01032463, 0.0224845 , 0.05633738, 0.49131596, 0.70707816],
       ...,
       [0.04214412, 0.1851437 , 0.333804  , 0.33615357, 0.5383806 ],
       [0.05936288, 0.7941727 , 0.33548555, 0.29314932, 0.17083244],
       [0.00444845, 0.01597195, 0.03321724, 0.38787782, 0.7361325 ]],
      dtype=float32)

In [300]:
#Returns the indices of the maximum values along an axis
rating_prediction=rating_prediction.argmax(axis=1)+1
rating_prediction

array([5, 4, 5, ..., 5, 2, 5])

In [301]:
#reshaping our dataframe
rating_prediction=rating_prediction.reshape(-1,1)

In [302]:
rating_prediction.shape

(9395, 1)

In [303]:
#predicting the recommendation
recommend_prediction=model_recommend.predict(X_test)

In [304]:
recommend_prediction

array([[0.9997156 ],
       [0.8490156 ],
       [0.9990539 ],
       ...,
       [0.9999391 ],
       [0.03686401],
       [0.9998796 ]], dtype=float32)

In [305]:
recommend_prediction=recommend_prediction.round(0)

In [306]:
recommend_prediction

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [1.]], dtype=float32)

In [307]:
recommend_prediction=recommend_prediction.astype(int)

In [308]:
recommend_prediction

array([[1],
       [1],
       [1],
       ...,
       [1],
       [0],
       [1]])

In [309]:
recommend_prediction.shape

(9395, 1)

In [310]:
result_df_recommend = pd.DataFrame(recommend_prediction, columns=['Recommended'])

In [311]:
result_df_recommend

Unnamed: 0,Recommended
0,1
1,1
2,1
3,1
4,1
...,...
9390,1
9391,1
9392,1
9393,0


In [312]:
result_df_rating = pd.DataFrame(rating_prediction, columns=['Rating'])

In [313]:
result_df_rating

Unnamed: 0,Rating
0,5
1,4
2,5
3,5
4,5
...,...
9390,5
9391,5
9392,5
9393,2


In [314]:
#submission file
final_result = pd.concat([id_df, result_df_rating, result_df_recommend], axis=1)

In [315]:
#converting to csv file
final_result.to_csv('ali_final_prediction.csv', index=False)

References:

https://numpy.org/devdocs/reference/generated/numpy.argmax.html

https://stackoverflow.com/questions/57333255/how-to-optimize-my-pandas-data-frame-pre-processing

https://towardsdatascience.com/clothes-reviews-analysis-with-nlp-part-1-d81bdfa14d97

https://towardsdatascience.com/clothes-reviews-analysis-with-nlp-part-1-bfb8a3a2c4bd