In [11]:
import pandas as pd
import numpy as np 
import re
from collections import Counter
import itertools
from pprint import pprint
import json 
import requests
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
from tqdm.auto import tqdm

In [2]:
raw_test = pd.read_excel("validation_dataset.xlsx")#.loc[0:10000,:]
raw_train = pd.read_excel("training_dataset.xlsx")#.loc[0:10000,:]
raw_train.head()

Unnamed: 0,text,label
0,agent says for contacting best buy this is nam...,troubleshooting
1,agent says name. may i please have your full n...,schedule installation
2,agent says thank you very much for calling bes...,trade in inquiry
3,agent says this um sorry can i ask for your co...,account security
4,agent says thank you for contacting best buy t...,product availability and stock


In [3]:
# Tokenize the text: split each string into words
words = raw_train.loc[0:5000, 'text'].str.split().tolist()

# Flatten the list of words
all_words = list(itertools.chain.from_iterable(words))

# Count the frequencies
word_counts = Counter(all_words)

# Get the most common words
most_common_words = word_counts.most_common()  # You can specify a number inside most_common(n)

# Display the most common words
pprint(most_common_words)

[('says', 258560),
 ('the', 103232),
 ('i', 101055),
 ('you', 93604),
 ('and', 66416),
 ('to', 65313),
 ('that', 51070),
 ('uh', 46468),
 ('it', 45216),
 ('a', 43752),
 ('for', 42723),
 ('is', 42388),
 ('so', 41271),
 ('okay', 34630),
 ('have', 32309),
 ('um', 28522),
 ('your', 27206),
 ('can', 26475),
 ('name', 25713),
 ('in', 24521),
 ('me', 24225),
 ('this', 24123),
 ('on', 23574),
 ('just', 22725),
 ('thank', 22396),
 ('we', 20992),
 ('my', 20522),
 ('its', 19569),
 ('be', 19525),
 ('im', 17804),
 ('if', 17178),
 ('do', 17115),
 ('date', 16973),
 ('of', 16907),
 ('yeah', 16006),
 ('with', 15457),
 ('best', 14467),
 ('but', 14409),
 ('they', 14264),
 ('like', 13896),
 ('what', 13239),
 ('number', 13209),
 ('was', 13121),
 ('get', 12814),
 ('or', 12740),
 ('not', 12409),
 ('buy', 11915),
 ('will', 11881),
 ('know', 11870),
 ('thats', 11830),
 ('no', 11780),
 ('let', 11733),
 ('phone', 10865),
 ('yes', 10638),
 ('dont', 10535),
 ('one', 10250),
 ('see', 10095),
 ('voice.creditcard.dig

In [4]:

with open("RemovedWords.json", "w+") as file:
    json.dump(list(np.array(most_common_words)[:,0]), file)

In [3]:
with open("FormattedWords.json", "r+") as file:
    most_common_words = json.load(file)

In [4]:
train_df = raw_train.copy()
test_df = raw_test.copy()

# List of words to remove
words_to_remove = list(np.array(most_common_words)[0:250,0])  # Add your words here

# Create a regular expression pattern
regex_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'

# Remove the words from the DataFrame column
train_df['text'] = raw_train['text'].str.strip().replace(regex_pattern, '', regex=True)
test_df['text'] = raw_test['text'].str.strip().replace(regex_pattern, '', regex=True)
print("Removed Common Words!")

# Replacing repeating words and one letter words
test_df['text'] = test_df['text'].str.replace(r'\b(\w)(\W*\1)*\b', '', regex=True)
train_df['text'] = train_df['text'].str.replace(r'\b(\w)(\W*\1)*\b', '', regex=True)
print("Removed Repeating words and one letter words")

# Remove sequences of spaces and all punctuation
train_df['text'] = train_df['text'].str.replace(r'\s+', ' ', regex=True)  # Replace multiple spaces with a single space
train_df['text'] = train_df['text'].str.replace(r'[^\w\s]', '', regex=True)  # Remove all punctuation except spaces and word characters

test_df['text'] = test_df['text'].str.replace(r'\s+', ' ', regex=True)  # Replace multiple spaces with a single space
test_df['text'] = test_df['text'].str.replace(r'[^\w\s]', '', regex=True)  # Remove all punctuation except spaces and word characters



train_df['text']

Removed Common Words!
Removed Repeating words and one letter words


0          contacting started full disconnected   apprec...
1           full  marjorie shady geek squad installation...
2          first telephone account account problem exact...
3          complete  nita green color    while connect c...
4          contacting chad started       curious cart li...
                                ...                        
379323     full  pidesky   disconnected   email home  tw...
379324      trisha   ever gets disconnected  gets discon...
379325       brooks peterson   disconnected  pull accoun...
379326     started full disconnected dead wanted  apolog...
379327     disconnected  takedreen cooling  request weir...
Name: text, Length: 379328, dtype: object

In [15]:
nltk.download('punkt')
tqdm.pandas()

# Create instances of lemmatizer and stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def lemmatize_sentence(sentence):
    words = nltk.word_tokenize(sentence)
    lem_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lem_words)

def stem_sentence(sentence):
    words = nltk.word_tokenize(sentence)
    stem_words = [stemmer.stem(word) for word in words]
    return ' '.join(stem_words)

train_df['stemmed_text'] = train_df['text'].progress_apply(lemmatize_sentence)
train_df['lemmatized_text'] = train_df['text'].progress_apply(stem_sentence)

test_df['stemmed_text'] = test_df['text'].progress_apply(lemmatize_sentence)
test_df['lemmatized_text'] = test_df['text'].progress_apply(stem_sentence)

[nltk_data] Downloading package punkt to /Users/owner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/379328 [00:00<?, ?it/s]

  0%|          | 0/379328 [00:00<?, ?it/s]

  0%|          | 0/47416 [00:00<?, ?it/s]

  0%|          | 0/47416 [00:00<?, ?it/s]

In [16]:
train_df.to_csv("TrainData.csv", index=False)
test_df.to_csv("TestData.csv", index=False)

In [7]:
train_df = pd.read_csv("TrainData.csv")
test_df = pd.read_csv("TestData.csv")

In [18]:
x_train = train_df['lemmatized_text']
y_train = train_df['label']

x_test = test_df['lemmatized_text']
y_test = test_df['label']

In [19]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

vectorizer = TfidfVectorizer(stop_words = 'english', max_features=1500)
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [21]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train.toarray())
x_test = scaler.transform(x_test.toarray())

In [35]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

#model = KNeighborsClassifier(n_neighbors=10, weights='uniform')
model = MLPClassifier((), activation='logistic', verbose=1, batch_size=512, max_iter=50, warm_start=True)

In [36]:
model.fit(x_train, y_train)

Iteration 1, loss = 1.78315482
Iteration 2, loss = 1.40246928
Iteration 3, loss = 1.35592989
Iteration 4, loss = 1.33244231
Iteration 5, loss = 1.31759368
Iteration 6, loss = 1.30715329
Iteration 7, loss = 1.29975580
Iteration 8, loss = 1.29466391
Iteration 9, loss = 1.29057066
Iteration 10, loss = 1.28772036
Iteration 11, loss = 1.28475042
Iteration 12, loss = 1.28316586
Iteration 13, loss = 1.28121979
Iteration 14, loss = 1.28022883
Iteration 15, loss = 1.27926456
Iteration 16, loss = 1.27811811




In [37]:
from sklearn.metrics import accuracy_score

print("Train Accuracy: %.3f%%" % (100 * accuracy_score(y_train, model.predict(x_train))))
print("Test  Accuracy: %.3f%%" % (100 * accuracy_score(y_test, model.predict(x_test))))

Train Accuracy: 60.480%
Test  Accuracy: 56.660%
