In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [15]:
import ssl

# SECURITY NOTE: this swaps the default HTTPS context for one that skips
# certificate verification. It affects every later HTTPS request made through
# the default context in this kernel, not only the NLTK downloads below.
# It is kept so the downloads behave as before; prefer repairing the local
# certificate store and removing this line.
ssl._create_default_https_context = ssl._create_unverified_context

# Download the corpora this notebook relies on: the stopword list, and
# WordNet, which WordNetLemmatizer looks up when lemmatizing.
nltk.download('stopwords')
nltk.download('wordnet')

# Load the English stopword list.
# The corpus reader is reached through `nltk.corpus` on purpose: this
# assignment rebinds the imported name `stopwords` to a plain list, so calling
# `stopwords.words(...)` would raise AttributeError the second time the cell
# is run.
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/arnav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Loading datasets
# NOTE: both locations are absolute, machine-specific paths; adjust them
# before running this notebook anywhere else.
TRAIN_CSV = '/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/CSE343-ML-Project/Data/Preprocessed-Data/train.csv'
TEST_CSV = '/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/CSE343-ML-Project/Data/Preprocessed-Data/test.csv'

# Read the train split first, then the test split.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [17]:
# Print datatypes
print(train_df.dtypes)
print(test_df.dtypes)

# Convert to string.
# Missing values are replaced with '' first: astype(str) on its own turns NaN
# into the literal text 'nan', which the later steps would treat as a word.
train_df['text'] = train_df['text'].fillna('').astype(str)
test_df['text'] = test_df['text'].fillna('').astype(str)

train_df.head()

text    object
dtype: object
text    object
dtype: object


Unnamed: 0,text
0,omg guys hello father may please acquire nouri...
1,suicidal people sub if need talk messege im
2,existential question the opposite ptsd memory
3,can imagine getting old me neitherwrinkles wei...
4,imagine rich lmao sounds kinda like control wo...


In [18]:
# Preview the first five rows of the test split
test_df.head(n=5)

Unnamed: 0,text
0,seeing everyone else better real trigger mebas...
1,why cant i ever normalim always pain i dont th...
2,i wasted lifei basically wasted life i never b...
3,the voice tells kill selfthe voice abuser cons...
4,i figured today complete survey get gems downl...


In [19]:
# Preprocessing the data
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
tokenizer = TweetTokenizer()

# Hashed copy of the stopword collection, so each membership test below is a
# constant-time lookup instead of a scan over the whole list.
stop_words = frozenset(stopwords)


def preprocess_text(text):
    """Normalise one document and return it as a list of tokens.

    Steps, applied in this order:
      1. split on whitespace and drop words found in ``stop_words``
         (the comparison is case-sensitive);
      2. lemmatize each remaining word with ``lemmatizer``;
      3. stem each lemma with ``stemmer``;
      4. join the stems with single spaces and tokenize the result with
         ``tokenizer``.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list of str
        The tokens produced by ``tokenizer.tokenize``.
    """
    kept_words = (word for word in text.split() if word not in stop_words)
    normalised = ' '.join(
        stemmer.stem(lemmatizer.lemmatize(word)) for word in kept_words
    )
    return tokenizer.tokenize(normalised)


# After this cell the 'text' column holds token lists, not strings.
# Running the cell a second time on those lists fails (lists have no .split),
# so re-run from the data-loading cell instead.
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)


In [20]:
def _tokens_to_text(value):
    """Join a sequence of tokens with single spaces.

    A value that is already a string is returned unchanged. Without this
    check, running the cell twice would call ' '.join on a string and silently
    turn it into space-separated characters.
    """
    return value if isinstance(value, str) else ' '.join(value)


# Convert the list of tokens into a string
train_df['text'] = train_df['text'].apply(_tokens_to_text)
test_df['text'] = test_df['text'].apply(_tokens_to_text)

# TF-IDF
# Upper bound on the vocabulary size, i.e. the number of feature columns.
MAX_FEATURES = 1000
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
# The vocabulary and IDF weights are fitted on the training text only; the
# test text is transformed with that same fitted vectorizer.
train_vectors = vectorizer.fit_transform(train_df['text'])
test_vectors = vectorizer.transform(test_df['text'])


In [21]:
# Show the shape of each TF-IDF matrix, train first, one per line
print(train_vectors.shape, test_vectors.shape, sep='\n')

(22500, 1000)
(7500, 1000)


In [22]:
# Turn each TF-IDF matrix into a dense array and wrap it in a DataFrame.
# Columns keep pandas' default integer labels.
train_vectors_df, test_vectors_df = (
    pd.DataFrame(matrix.toarray()) for matrix in (train_vectors, test_vectors)
)

train_vectors_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.180286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Write each TF-IDF frame to its own CSV file (paths are relative, so they
# resolve against the kernel's working directory). to_csv is called with its
# defaults, so the row index is written as the first column.
tfidf_outputs = (
    (train_vectors_df, 'train_tfidf.csv'),
    (test_vectors_df, 'test_tfidf.csv'),
)
for frame, csv_name in tfidf_outputs:
    frame.to_csv(csv_name)