In [2]:
import pandas as pd
import numpy as np 
import re
from collections import Counter
import itertools
from pprint import pprint
import json 
import requests
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
from tqdm.auto import tqdm
import nltk

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
raw_validation = pd.read_excel("validation_dataset.xlsx")#.loc[0:10000,:]
raw_train = pd.read_excel("training_dataset.xlsx")#.loc[0:10000,:]
raw_test = pd.read_excel("test_dataset.xlsx")#.loc[0:10000,:]
raw_train.head()

In [None]:
# Tokenize the text: split each string into words
words = raw_train.loc[0:5000, 'text'].str.split().tolist()

# Flatten the list of words
all_words = list(itertools.chain.from_iterable(words))

# Count the frequencies
word_counts = Counter(all_words)

# Get the most common words
most_common_words = word_counts.most_common()  # You can specify a number inside most_common(n)

# Display the most common words
pprint(most_common_words)

In [None]:

with open("RemovedWords.json", "w+") as file:
    json.dump(list(np.array(most_common_words)[:,0]), file)

In [None]:
with open("FormattedWords.json", "r+") as file:
    most_common_words = json.load(file)

In [None]:
train_df = raw_train.copy()
validation_df = raw_validation.copy()
test_df = raw_test.copy()


# List of words to remove
words_to_remove = list(np.array(most_common_words)[0:250,0])  # Add your words here

# Create a regular expression pattern
regex_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'

# Remove the words from the DataFrame column
train_df['text'] = raw_train['text'].str.strip().replace(regex_pattern, '', regex=True)
test_df['text'] = raw_test['text'].str.strip().replace(regex_pattern, '', regex=True)
validation_df['text'] = raw_validation['text'].str.strip().replace(regex_pattern, '', regex=True)
print("Removed Common Words!")

# Replacing repeating words and one letter words
test_df['text'] = test_df['text'].str.replace(r'\b(\w)(\W*\1)*\b', '', regex=True)
train_df['text'] = train_df['text'].str.replace(r'\b(\w)(\W*\1)*\b', '', regex=True)
validation_df['text'] = validation_df['text'].str.replace(r'\b(\w)(\W*\1)*\b', '', regex=True)
print("Removed Repeating words and one letter words")

# Remove sequences of spaces and all punctuation
train_df['text'] = train_df['text'].str.replace(r'\s+', ' ', regex=True)  # Replace multiple spaces with a single space
train_df['text'] = train_df['text'].str.replace(r'[^\w\s]', '', regex=True)  # Remove all punctuation except spaces and word characters

test_df['text'] = test_df['text'].str.replace(r'\s+', ' ', regex=True)  # Replace multiple spaces with a single space
test_df['text'] = test_df['text'].str.replace(r'[^\w\s]', '', regex=True)  # Remove all punctuation except spaces and word characters

validation_df['text'] = validation_df['text'].str.replace(r'\s+', ' ', regex=True)  # Replace multiple spaces with a single space
validation_df['text'] = validation_df['text'].str.replace(r'[^\w\s]', '', regex=True)  # Remove all punctuation except spaces and word characters

train_df['text']

In [None]:
tqdm.pandas()

# Create instances of lemmatizer and stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def lemmatize_sentence(sentence):
    words = nltk.word_tokenize(sentence)
    lem_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lem_words)

def stem_sentence(sentence):
    words = nltk.word_tokenize(sentence)
    stem_words = [stemmer.stem(word) for word in words]
    return ' '.join(stem_words)

train_df['stemmed_text'] = train_df['text'].progress_apply(lemmatize_sentence)
train_df['lemmatized_text'] = train_df['text'].progress_apply(stem_sentence)

test_df['stemmed_text'] = test_df['text'].progress_apply(lemmatize_sentence)
test_df['lemmatized_text'] = test_df['text'].progress_apply(stem_sentence)

validation_df['stemmed_text'] = validation_df['text'].progress_apply(lemmatize_sentence)
validation_df['lemmatized_text'] = validation_df['text'].progress_apply(stem_sentence)

In [None]:
train_df.to_csv("TrainData.csv", index=False)
test_df.to_csv("TestData.csv", index=False)
validation_df.to_csv("ValidationData.csv", index=False)

In [3]:
train_df = pd.read_csv("TrainData.csv")
test_df = pd.read_csv("TestData.csv")
validation_df = pd.read_csv("ValidationData.csv")

In [4]:
x_train = train_df['lemmatized_text']
y_train = train_df['label']

x_validation = validation_df['lemmatized_text']
y_validation = validation_df['label']

x_test = test_df['lemmatized_text']
y_test = test_df['label']

In [5]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
y_train = label_encoder.fit_transform(y_train)
y_validation = label_encoder.transform(y_validation)
y_test = label_encoder.transform(y_test)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

vectorizer = TfidfVectorizer(stop_words = 'english', max_features=1500)
x_train = vectorizer.fit_transform(x_train.astype('U'))
x_validation = vectorizer.transform(x_validation.astype('U'))
x_test = vectorizer.transform(x_test.astype('U'))

In [7]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train.toarray())
x_validation = scaler.transform(x_validation.toarray())
x_test = scaler.transform(x_test.toarray())

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

kneighbors_model = KNeighborsClassifier(n_neighbors=10, weights='uniform')
gaussiannb_model = GaussianNB()
nn_model = MLPClassifier((), activation='logistic', verbose=1, batch_size=512, max_iter=50, warm_start=True)
#xgboost_model = xgb.XGBClassifier(max_depth=10, n_estimators=100, learning_rate=0.1, colsample_bytree=0.5, subsample=0.5, n_jobs=-1, verbosity=1)
decisiontree_model = DecisionTreeClassifier(max_depth=10, min_samples_split=2, min_samples_leaf=1, random_state=42)

In [9]:
kneighbors_model.fit(x_train, y_train)
gaussiannb_model.fit(x_train, y_train)
nn_model.fit(x_train, y_train)
#xgboost_model.fit(x_train, y_train)
decisiontree_model.fit(x_train, y_train)

Iteration 1, loss = 1.82839996
Iteration 2, loss = 1.45653795
Iteration 3, loss = 1.41345701
Iteration 4, loss = 1.39078836
Iteration 5, loss = 1.37612235
Iteration 6, loss = 1.36536211
Iteration 7, loss = 1.35781288
Iteration 8, loss = 1.35197974
Iteration 9, loss = 1.34773770
Iteration 10, loss = 1.34411368
Iteration 11, loss = 1.34165982
Iteration 12, loss = 1.33945112
Iteration 13, loss = 1.33734788
Iteration 14, loss = 1.33567875
Iteration 15, loss = 1.33479536
Iteration 16, loss = 1.33386873
Iteration 17, loss = 1.33218840
Iteration 18, loss = 1.33166002
Iteration 19, loss = 1.33115716
Iteration 20, loss = 1.33059619
Iteration 21, loss = 1.32962825
Iteration 22, loss = 1.32927377
Iteration 23, loss = 1.32870703
Iteration 24, loss = 1.32847379
Iteration 25, loss = 1.32791327
Iteration 26, loss = 1.32784429
Iteration 27, loss = 1.32728447
Iteration 28, loss = 1.32682559
Iteration 29, loss = 1.32670234
Iteration 30, loss = 1.32619317
Iteration 31, loss = 1.32591213
Iteration 32, los



In [11]:
from sklearn.metrics import accuracy_score

print("K Neighbors Validation Accuracy: %.3f%%" % (100 * accuracy_score(y_validation, kneighbors_model.predict(x_validation))))
print("Gaussian NB Validation Accuracy: %.3f%%" % (100 * accuracy_score(y_validation, gaussiannb_model.predict(x_validation))))
print("Neural Network Validation Accuracy: %.3f%%" % (100 * accuracy_score(y_validation, nn_model.predict(x_validation))))
#print("XGBoost Validation Accuracy: %.3f%%" % (100 * accuracy_score(y_validation, xgboost_model.predict(x_validation))))
print("Decision Tree Validation Accuracy: %.3f%%" % (100 * accuracy_score(y_validation, decisiontree_model.predict(x_validation))))

K Neighbors Validation Accuracy: 18.445%
Gaussian NB Validation Accuracy: 11.125%
Neural Network Validation Accuracy: 54.480%
Decision Tree Validation Accuracy: 32.957%


In [12]:
tqdm.pandas()

predictions = nn_model.predict_proba(x_test)
predictions = np.argsort(predictions, axis=1)[:,-3:]
class_labels = nn_model.classes_

prediction_df = pd.DataFrame(predictions, columns=['Third', 'Second', 'First'])
prediction_df['First'] = class_labels[prediction_df['First']]
prediction_df['Second'] = class_labels[prediction_df['Second']]
prediction_df['Third'] = class_labels[prediction_df['Third']]
prediction_df['Correct Label'] = y_test
prediction_df['In Top 3'] = False

prediction_df['Correct'] = prediction_df.progress_apply(lambda row: True if row['Correct Label'] == row['First'] else False, axis=1)
prediction_df['In Top 3'] = prediction_df.progress_apply(lambda row: row['Correct Label'] in row[['First', 'Second', 'Third']].values, axis=1)

print(prediction_df.head())

# find percent where Correct = First
print("Percent where Correct = First: %.3f%%" % (100 * len(prediction_df[prediction_df['Correct'] == True])/len(prediction_df)))
print("Percent where Correct in Top 3: %.3f%%" % (100 * len(prediction_df[prediction_df['In Top 3'] == True])/len(prediction_df)))

100%|██████████| 47416/47416 [00:00<00:00, 160374.68it/s]
100%|██████████| 47416/47416 [00:12<00:00, 3806.78it/s]

   Third  Second  First  Correct Label  In Top 3  Correct
0     32      28     33             31     False    False
1      7       2     52              2      True    False
2     32      31     33             33      True     True
3     13      14     46             46      True     True
4     43      33     17             33      True    False
Percent where Correct = First: 54.144%
Percent where Correct in Top 3: 82.139%





In [18]:
# save neural net model to .pkl
import pickle
pickle.dump(nn_model, open("nn_model.pkl", "wb"))
pd.DataFrame(nn_model.predict_proba(x_test)).to_csv("prob.csv", index=False)
pd.DataFrame(y_test).to_csv("y_test.csv", index=False)