# Drop-Shipping Listings Classification Project - TF_IDF, Word2Vec, and Logistic Regression

In [1]:
# Importing needed packages 
import pandas as pd 
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Handling DROP-SHIPPING data
file_path1 = 'Data Scraping/LightSabers_DS_Processed.csv'
file_path2 = 'Data Scraping/StripLights_DS_Processed.csv'
file_path3 = 'Data Scraping/PowerAdapters_DS_Processed.csv'
df_1 = pd.read_csv(file_path1, header=None)
df_2 = pd.read_csv(file_path2, header=None)
df_3 = pd.read_csv(file_path3, header=None)

# Create combined dataframe
ds_df = pd.concat([df_1, df_2, df_3])

# Set header
ds_df.columns = ['Text from listing']
ds_df['label'] = 0
print(ds_df)

                                    Text from listing  label
0   skip main content walmart depart servic search...      0
1   christma saber giveaway four zero off all sabe...      0
2   dure black novemb youv donat four zero zero ze...      0
3   v skip main content ebay home shop categori en...      0
4   skip content shop our holiday gift guid to fin...      0
..                                                ...    ...
94  jameco electron product search enter product w...      0
95  jameco electron product search enter product w...      0
96  jameco electron product search enter product w...      0
97  jameco electron product search enter product w...      0
98  jameco electron product search enter product w...      0

[300 rows x 2 columns]


In [3]:
# Handling NON DROP-SHIPPING data
# Function to read and process a text file
def read_and_process_file(filename):
    with open(filename, 'r', encoding = 'utf-8') as file:
        data = file.read().split('\n')  # Splitting by line break
    return data

# File names
file_names = ['Data Scraping/lightsaber_non_DS.txt', 'Data Scraping/poweradapters_non_DS.txt', 'Data Scraping/striplights_non_DS.txt']

# Reading and processing all files
all_data = []
for file in file_names:
    data = read_and_process_file(file)
    all_data.extend(data)

# Converting to a DataFrame
non_ds_df = pd.DataFrame(all_data, columns=['Text from listing'])
non_ds_df['label'] = 1


In [4]:
# COMBINING DFs
combined_df = pd.concat([ds_df, non_ds_df])


# Shuffling combined DF in order to aid randomness
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(combined_df)

                                      Text from listing  label
0     aliexpress aliexpresscom onlin shop automot ph...      1
1     huawei supercharg cabl originaladapt huawei su...      1
2     one six color rgbretractabletwo one flash ligh...      1
3     skip content saberforg saber adapt saber part ...      0
4     dark saber lightsaberdark saber bladelightsab ...      1
...                                                 ...    ...
3012  aliexpress aliexpresscom onlin shop automot ph...      1
3013  aliexpress aliexpresscom onlin shop automot ph...      1
3014  usb c fast charger mobil phone charger mobil p...      1
3015  aliexpress aliexpresscom onlin shop automot ph...      1
3016  aliexpress aliexpresscom onlin shop automot ph...      1

[3017 rows x 2 columns]


In [282]:
# with open('combined_training_set.csv', 'w', encoding = 'utf-8') as csv_file:
#
#     header = ','.join(combined_df.columns) + '\n'
#     csv_file.write(header)
#
#
#     for index, row in combined_df.iterrows():
#         row_str = ','.join(map(str, row.values)) + '\n'
#         csv_file.write(row_str)

In [5]:
### Splitting dataframe into training, development and test:
# combined_df = pd.read_csv('combined_training_set.csv')
# combined_df = pd.read_csv('combined_training_set.csv')
# print(combined_df)
# Splitting the data into training (70%) and a temporary set (30%)
train_df, temp_df = train_test_split(combined_df, test_size=0.3, stratify=combined_df['label'], random_state=42)

# Splitting the temporary set into development and test sets (each 15% of the original data)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

y_train = train_df['label'].values
y_dev = dev_df['label'].values
y_test = test_df['label'].values

## TF-IDF 

In [7]:
# Exclude the label column and then combine the rest
# preprocessed_df['combined_text'] = preprocessed_df.drop('label', axis=1).apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['Text from listing'])

In [8]:
# Transform the Development and Test Sets:
X_dev_tfidf = tfidf_vectorizer.transform(dev_df['Text from listing'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['Text from listing'])


### Logistic regression - Setup

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [10]:
model_LR = LogisticRegression()
model_LR.fit(X_train_tfidf, y_train)

# Using Stratified CV (TF-IDF)

In [61]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score

number_of_folds = 3
skf = StratifiedKFold(n_splits=number_of_folds, random_state=42, shuffle=True)
model_LR2 = LogisticRegression(class_weight="balanced")
model_LR3 = LogisticRegression()
model_LR4 = LogisticRegression(C=0.5)


In [62]:
# Combining TRAINING and DEVELOPMENT sets to perform Stratified Cross-Validation
train_dev_df = pd.concat([train_df, dev_df])
X_train_dev = train_dev_df
y_train_dev = train_dev_df['label']

In [73]:
models = [model_LR2]
final_model = model_LR2
X_test_tfidf_2 = X_test_tfidf
for model in models: 
    accuracies = []
    precisions_DS = []
    precisions_non_DS = []
    recalls_DS= []
    recalls_non_DS= []
    f1_scores_DS = []
    f1_scores_non_DS = []

    for train_index, val_index in skf.split(X_train_dev, y_train_dev):

        # Splitting the data into train and validation for this fold
        X_train_fold, X_val_fold = X_train_dev.iloc[train_index], X_train_dev.iloc[val_index]
        y_train_fold, y_val_fold = y_train_dev.iloc[train_index], y_train_dev.iloc[val_index]

        # Applying TF-IDF transformation within each fold
        tfidf_vectorizer = TfidfVectorizer()
        X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_fold['Text from listing'])
        X_val_tfidf = tfidf_vectorizer.transform(X_val_fold['Text from listing'])
        X_test_tfidf_2 = tfidf_vectorizer.transform(test_df['Text from listing'])

        # Fit the model
        model.fit(X_train_tfidf, y_train_fold)
        final_model = model

        # Predict and evaluate on the validation data
        predictions = model.predict(X_val_tfidf)

        # Evaluate the model
        accuracy = accuracy_score(y_val_fold, predictions)
        precision = precision_score(y_val_fold, predictions, average=None)
        recall = recall_score(y_val_fold, predictions, average=None)
        f1 = f1_score(y_val_fold, predictions, average=None)

        accuracies.append(accuracy)
        precisions_DS.append(precision[0])
        precisions_non_DS.append(precision[1])
        recalls_DS.append(recall[0])
        recalls_non_DS.append(recall[1])
        f1_scores_DS.append(f1[0])
        f1_scores_non_DS.append(f1[1])
        
        #print(f"Confusion Matrix:\n{confusion_matrix(y_test_CV, predictions)}")

    average_accuracy = sum(accuracies) / len(accuracies)
    average_precision = (sum(precisions_DS) / len(precisions_DS), sum(precisions_non_DS) / len(precisions_non_DS))
    average_recall = (sum(recalls_DS) / len(recalls_DS), sum(recalls_non_DS) / len(recalls_non_DS))
    average_f1_score = (sum(f1_scores_DS) / len(f1_scores_DS), sum(f1_scores_non_DS) / len(f1_scores_non_DS))

    print(f"The evaluation metrics for model {model} are as follows:")
    print(f"Average Accuracy: {average_accuracy}")
    print(f"Average Precision. DS: {average_precision[0]}, Non DS: {average_precision[1]}")
    print(f"Average Recall. DS: {average_recall[0]}, Non DS: {average_recall[1]}")
    print(f"Average F1 Score. DS: {average_f1_score[0]}, Non DS: {average_f1_score[1]}")
    print()

The evaluation metrics for model LogisticRegression(class_weight='balanced') are as follows:
Average Accuracy: 0.9434510684726387
Average Precision. DS: 0.9461421946333392, Non DS: 0.9432072954678604
Average Recall. DS: 0.6502707382165465, Non DS: 0.9935946861374254
Average F1 Score. DS: 0.7707339901393001, Non DS: 0.9677433903005145



# Using Stratified CV (Word2Vec)

In [74]:
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
import numpy as np


In [75]:
# Create helper function - Function to create averaged Word2Vec vectors
def get_mean_vector(word2vec_model, words):
    # Filter out words not in the vocabulary and compute mean vector
    return np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(word2vec_model.vector_size)], axis=0)


In [76]:
X_train_dev['tokenized_text'] = X_train_dev['Text from listing'].apply(word_tokenize)

In [None]:
models = [model_LR2, model_LR3, model_LR4]

for model in models: 
    accuracies = []
    precisions_DS = []
    precisions_non_DS = []
    recalls_DS= []
    recalls_non_DS= []
    f1_scores_DS = []
    f1_scores_non_DS = []

    for train_index, val_index in skf.split(X_train_dev, y_train_dev):

        # Splitting the data into train and validation for this fold
        X_train_fold, X_val_fold = X_train_dev.iloc[train_index], X_train_dev.iloc[val_index]
        y_train_fold, y_val_fold = y_train_dev.iloc[train_index], y_train_dev.iloc[val_index]

        # Train Word2Vec on the training part of the fold
        word2vec_model = Word2Vec(X_train_fold['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)

        # Create Word2Vec feature vectors for training and validation data
        X_train_vectors = np.array([get_mean_vector(word2vec_model, words) for words in X_train_fold['tokenized_text']])
        X_val_vectors = np.array([get_mean_vector(word2vec_model, words) for words in X_val_fold['tokenized_text']])


        # Fit the model
        model.fit(X_train_vectors, y_train_fold)
        

        # Predict and evaluate on the validation data
        predictions = model.predict(X_val_vectors)

        # Evaluate the model
        accuracy = accuracy_score(y_val_fold, predictions)
        precision = precision_score(y_val_fold, predictions, average=None)
        recall = recall_score(y_val_fold, predictions, average=None)
        f1 = f1_score(y_val_fold, predictions, average=None)

        accuracies.append(accuracy)
        precisions_DS.append(precision[0])
        precisions_non_DS.append(precision[1])
        recalls_DS.append(recall[0])
        recalls_non_DS.append(recall[1])
        f1_scores_DS.append(f1[0])
        f1_scores_non_DS.append(f1[1])
        
        #print(f"Confusion Matrix:\n{confusion_matrix(y_test_CV, predictions)}")

    average_accuracy = sum(accuracies) / len(accuracies)
    average_precision = (sum(precisions_DS) / len(precisions_DS), sum(precisions_non_DS) / len(precisions_non_DS))
    average_recall = (sum(recalls_DS) / len(recalls_DS), sum(recalls_non_DS) / len(recalls_non_DS))
    average_f1_score = (sum(f1_scores_DS) / len(f1_scores_DS), sum(f1_scores_non_DS) / len(f1_scores_non_DS))

    print(f"The evaluation metrics for model {model} are as follows:")
    print(f"Average Accuracy: {average_accuracy}")
    print(f"Average Precision. DS: {average_precision[0]}, Non DS: {average_precision[1]}")
    print(f"Average Recall. DS: {average_recall[0]}, Non DS: {average_recall[1]}")
    print(f"Average F1 Score. DS: {average_f1_score[0]}, Non DS: {average_f1_score[1]}")
    print()

## Evaluating on unseen test data (W2V)

### W2V

In [126]:
# Tokenizing data 
X_train_dev['tokenized_text'] = X_train_dev['Text from listing'].apply(word_tokenize)
test_df['tokenized_text'] = test_df['Text from listing'].apply(word_tokenize)

# Train Word2Vec on the training part
word2vec_model = Word2Vec(X_train_dev['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)

# Create Word2Vec feature vectors for training and validation data
X_train_vectors = np.array([get_mean_vector(word2vec_model, words) for words in X_train_dev['tokenized_text']])
X_test_vectors = np.array([get_mean_vector(word2vec_model, words) for words in test_df['tokenized_text']])

In [127]:
### DEFINING MODEL TO BE EVALUATED!!!
y_train_dev = train_dev_df['label'].values

# CHANGE THIS TO THE MODEL YOU WANT TO EVALUATE
final_model1  = LogisticRegression(class_weight="balanced")
#final_model1  = LogisticRegression()
# final_model1 = LogisticRegression(C=0.5)
final_model1.fit(X_train_vectors, y_train_dev)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [130]:
### RUN THIS CODE TO GET FINAL EVALUATION!!!
# Predict and evaluate on the test data
predictions = final_model1.predict(X_test_vectors)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average=None)
recall = recall_score(y_test, predictions, average=None)
f1 = f1_score(y_test, predictions, average=None)

print(f"The evaluation metrics for model {final_model1} are as follows:")
print(f"Accuracy: {accuracy: .3f}")
print(f"Precision. DS: {precision[0]: .3f}, Non DS: {precision[1]: .3f}")
print(f"Recall. DS: {recall[0]: .3f}, Non DS: {recall[1]: .3f}")
print(f"F1 Score. DS: {f1[0]: .3f}, Non DS: {f1[1]: .3f}")

The evaluation metrics for model LogisticRegression(class_weight='balanced') are as follows:
Accuracy: 0.957
Precision. DS: 0.945, Non DS: 0.974
Recall. DS: 0.712, Non DS: 0.996
F1 Score. DS: 0.809, Non DS: .985
