# **Data Preprocessing**



In [237]:
# Common
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix

# Regression
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import f as f_test

In [238]:
clothes = pd.read_csv('/content/clothes.csv')
clothes.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [239]:
clothes.dropna(inplace=True)
clothes.reset_index(drop=True, inplace=True)
clothes.head(5)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
1,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
2,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
3,5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses
4,6,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,1,1,General Petite,Tops,Knits


# **Splitting Data (Question 1)**

In [209]:
X = clothes[['Age', 'Rating', 'Department Name', 'Class Name']]
y = clothes['Recommended IND']

In [210]:
ct = ColumnTransformer(
    [('one-hot-encoder', OneHotEncoder(drop='first'), ['Department Name', 'Class Name'])],
    remainder='passthrough'
)

In [211]:
X_transform = ct.fit_transform(X)

# **Training with Classification Models (Question 1)**

In [212]:
X_train, X_test, y_train, y_test = train_test_split(X_transform, y, test_size=0.2, random_state=42)

In [213]:
lst = ['Logistic Regression', 'kNN', 'SVC', 'Random Forest']
for i in lst:
  if i == 'Logistic Regression':
    model = LogisticRegression()
  elif i == 'kNN':
    model = KNeighborsClassifier()
  elif i == 'SVC':
    model = SVC()
  elif i == 'Random Forest':
    model = RandomForestClassifier()
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print("\n{}".format(classification_report(y_test, y_pred)))
  print("Confusion Matrix of {}:\n".format(i), confusion_matrix(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



              precision    recall  f1-score   support

           0       0.80      0.92      0.86       700
           1       0.98      0.95      0.97      3233

    accuracy                           0.94      3933
   macro avg       0.89      0.94      0.91      3933
weighted avg       0.95      0.94      0.95      3933

Confusion Matrix of Logistic Regression:
 [[ 647   53]
 [ 166 3067]]

              precision    recall  f1-score   support

           0       0.84      0.81      0.82       700
           1       0.96      0.97      0.96      3233

    accuracy                           0.94      3933
   macro avg       0.90      0.89      0.89      3933
weighted avg       0.94      0.94      0.94      3933

Confusion Matrix of kNN:
 [[ 566  134]
 [ 111 3122]]

              precision    recall  f1-score   support

           0       0.85      0.78      0.81       700
           1       0.95      0.97      0.96      3233

    accuracy                           0.94      3933
   

# **Splitting Data (Question 2)**

In [241]:
X = clothes[['Age', 'Rating', 'Department Name', 'Class Name']]
y = clothes['Positive Feedback Count']

In [242]:
categorical_features = ['Department Name', 'Class Name']
categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_features = ['Age', 'Rating']
numeric_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [243]:
ct = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [244]:
model = Pipeline(steps = [
    ('preprocessor', ct),
    ('regressor', LinearRegression())
])

# **Training with Regression Model (Question 2)**

In [245]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [246]:
model.fit(X_train, y_train)

In [247]:
y_pred = model.predict(X_test)

In [248]:
mse = mean_squared_error(y_test, y_pred)
mse

30.078672093726237

In [249]:
dfn = X_train.shape[1]
dfd = X_train.shape[0] - X_train.shape[1]
f_statistic = mse / (mse / dfd)

In [252]:
alpha = 0.05
critical_f_value = f_test.ppf(1 - alpha, dfn, dfd)

In [255]:
if f_statistic > critical_f_value:
    print("The group of variables (Age, Rating, Department Name, and Class Name) has a strong connection on Positive Feedback Count.")
else:
    print("The group of variables (Age, Rating, Department Name, and Class Name) does not have a strong connection on Positive Feedback Count.")

The group of variables (Age, Rating, Department Name, and Class Name) has a strong connection on Positive Feedback Count.
