In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import html
import re
import string

In [2]:
df = pd.read_csv('/content/drive/MyDrive/AMIT_AI/KAGGLE/DATA_SET/twitter sentiment/twitter_training.csv')

In [3]:
df

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
df.duplicated().sum()

2700

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df.isnull().sum()

Unnamed: 0,0
2401,0
Borderlands,0
Positive,0
"im getting on borderlands and i will murder you all ,",326


In [8]:
df.dropna(inplace=True)

In [9]:
df['Positive'].value_counts()

Unnamed: 0_level_0,count
Positive,Unnamed: 1_level_1
Negative,21698
Positive,19712
Neutral,17708
Irrelevant,12537


In [10]:
X = df.iloc[:,3]
df['Positive']  = df['Positive'].map({'Positive':1,'Negative':0, 'Neutral':2, 'Irrelevant':3})
y = df['Positive']

In [11]:
X

Unnamed: 0,"im getting on borderlands and i will murder you all ,"
0,I am coming to the borders and I will kill you...
1,im getting on borderlands and i will kill you ...
2,im coming on borderlands and i will murder you...
3,im getting on borderlands 2 and i will murder ...
4,im getting into borderlands and i can murder y...
...,...
74676,Just realized that the Windows partition of my...
74677,Just realized that my Mac window partition is ...
74678,Just realized the windows partition of my Mac ...
74679,Just realized between the windows partition of...


## Preprocessing




In [12]:
import re

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and numbers
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"'s", "is", text)
    text = re.sub(r"'ll", "will", text)
    text = re.sub(r"'ve", "have", text)
    text = re.sub(r"'re", "are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    return text

In [13]:
X_pre = X.apply(preprocess_text)

In [14]:
X[6]

"So I spent a couple of hours doing something for fun... If you don't know that I'm a huge @ Borderlands fan and Maya is one of my favorite characters, I decided to make a wallpaper for my PC.. Here's the original picture compared to the creation I made:) Have fun! pic.twitter.com / mLsI5wf9Jg"

In [15]:
X_pre[6]

'so i spent a couple of hours doing something for fun if you dont know that im a huge  borderlands fan and maya is one of my favorite characters i decided to make a wallpaper for my pc heres the original picture compared to the creation i made have fun pictwittercom  mlsiwfjg'

In [16]:
# pass X_pre content into list of documents
documents = X_pre.tolist()

In [17]:
documents

['i am coming to the borders and i will kill you all',
 'im getting on borderlands and i will kill you all',
 'im coming on borderlands and i will murder you all',
 'im getting on borderlands  and i will murder you me all',
 'im getting into borderlands and i can murder you all',
 'so i spent a few hours making something for fun   if you dont know i am a huge borderlands fan and maya is one of my favorite characters so i decided to make myself a wallpaper for my pc  here is the original image versus the creation i made  enjoy pictwittercommlsiwfjg',
 'so i spent a couple of hours doing something for fun if you dont know that im a huge  borderlands fan and maya is one of my favorite characters i decided to make a wallpaper for my pc heres the original picture compared to the creation i made have fun pictwittercom  mlsiwfjg',
 'so i spent a few hours doing something for fun if you dont know im a huge  borderlands fan and maya is one of my favorite characters',
 'so i spent a few hours ma

## Tokenization

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [19]:

vocabulary = set(word for tokens in documents for word in tokens.split())
vocabulary = sorted(vocabulary)
word_to_index = {word: i for i, word in enumerate(vocabulary)}

## High level BoW

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X_bow_matrix = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out())
print(X_bow_matrix.toarray()) # Print the transformed matrix

['aa' 'aaa' 'aaaaaaaaaaaa' ... 'zzmhpax' 'zzvfsrhewg' 'zzz']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Row Implementation of BoW

In [32]:
# small document sample to apply row implementation
small_documents = documents[:50]

In [33]:
from collections import Counter
ri_bow_matrix = []

for sent in small_documents:
    word_count = Counter(sent.lower().split())
    bow_vector = [word_count[word] for word in vocabulary]
    ri_bow_matrix.append(bow_vector)

print('Vocab', vocabulary)
print('BoW Matrix', ri_bow_matrix)

BoW Matrix [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Using Pandas

In [34]:
p_bow_data = []

for doc in small_documents:
  word_counter = Counter(doc.lower().split())
  p_bow_data.append({word: word_counter.get(word, 0) for word in vocabulary})

bow_df = pd.DataFrame(p_bow_data, columns=vocabulary)
print(bow_df)

    a  aa  aaa  aaaaaaaaaaaa  aaaaaaaaaaaaa  \
0   0   0    0             0              0   
1   0   0    0             0              0   
2   0   0    0             0              0   
3   0   0    0             0              0   
4   0   0    0             0              0   
5   3   0    0             0              0   
6   3   0    0             0              0   
7   2   0    0             0              0   
8   3   0    0             0              0   
9   3   0    0             0              0   
10  0   0    0             0              0   
11  0   0    0             0              0   
12  0   0    0             0              0   
13  0   0    0             0              0   
14  0   0    0             0              0   
15  0   0    0             0              0   
16  0   0    0             0              0   
17  2   0    0             0              0   
18  2   0    0             0              0   
19  2   0    0             0              0   
20  2   0    

## High-Level TF-IDF

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_tfidf_matrix = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out())
print(X_tfidf_matrix.toarray()) # Print the transformed matrix

['aa' 'aaa' 'aaaaaaaaaaaa' ... 'zzmhpax' 'zzvfsrhewg' 'zzz']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
