<a href="https://colab.research.google.com/github/arutraj/.githubcl/blob/main/5_Text_Representation_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Table of Contents

### I. Loading and Preprocessing Data
### II. Creating Text Representations
> ##### 1. Bag Of Words
> ##### 2. TF-IDF

# I. Loading and Preprocessing Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
# Import library
import numpy as np
import pandas as pd

In [None]:
# Read dataset
df = pd.read_csv(r'/content/tweets.csv')

In [None]:
# Print dataset
df.head()

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted
0,RT @rssurjewala: Critical question: Was PayTM ...,False,0.0,,2016-11-23 18:40:30,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331.0,True,False
1,RT @Hemant_80: Did you vote on #Demonetization...,False,0.0,,2016-11-23 18:40:29,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",PRAMODKAUSHIK9,66.0,True,False
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",False,0.0,,2016-11-23 18:40:03,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12.0,True,False
3,RT @ANI_news: Gurugram (Haryana): Post office ...,False,0.0,,2016-11-23 18:39:59,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",deeptiyvd,338.0,True,False
4,RT @satishacharya: Reddy Wedding! @mail_today ...,False,0.0,,2016-11-23 18:39:39,False,,8.014954e+17,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120.0,True,False


In [None]:
# Only keep text column
df.columns

Index(['text', 'favorited', 'favoriteCount', 'replyToSN', 'created',
       'truncated', 'replyToSID', 'id', 'replyToUID', 'statusSource',
       'screenName', 'retweetCount', 'isRetweet', 'retweeted'],
      dtype='object')

In [None]:
drop_cols = ['favorited', 'favoriteCount', 'replyToSN', 'created',
       'truncated', 'replyToSID', 'id', 'replyToUID', 'statusSource',
       'screenName', 'retweetCount', 'isRetweet', 'retweeted']
df.drop(columns = drop_cols, inplace = True)

In [None]:
drop_cols = ['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone']

df.drop(columns = drop_cols, inplace = True)

KeyError: "['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone'] not found in axis"

In [20]:
# Print dataset
df.head()

Unnamed: 0,text,text_clean
0,RT @rssurjewala: Critical question: Was PayTM ...,RT rssurjewala critical question be PayTM info...
1,RT @Hemant_80: Did you vote on #Demonetization...,RT Hemant do you vote on Demonetization on Mod...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",RT roshankar former finsec RBI Dy Governor CBD...
3,RT @ANI_news: Gurugram (Haryana): Post office ...,RT ANI news Gurugram Haryana Post office emplo...
4,RT @satishacharya: Reddy Wedding! @mail_today ...,RT satishacharya Reddy Wedding mail today cart...


# II. Creating Text Representations

## Bag of Words

In [None]:
# Import BoW function from sklearn
from sklearn.feature_extraction.text import CountVectorizer

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [None]:
# Creating an object for the vectorizer
word_bow = CountVectorizer()

In [None]:
# Fit on training data
word_bow.fit(df['text'].values)

In [None]:
# Transform the training data
word_vectors_bow = word_bow.transform(df['text'].values)

In [None]:
# Features
word_bow.get_feature_names_out()

array(['00', '000', '00716', ..., 'zzh5moxrtq', 'zzthdwqbfy',
       'zzyjzzuhlu'], dtype=object)

In [None]:
# Shape of the matrix
# 5157 documents and 13541 unique words
word_vectors_bow

<5157x13541 sparse matrix of type '<class 'numpy.int64'>'
	with 86437 stored elements in Compressed Sparse Row format>

In [None]:
# Document representation
vocab = word_bow.get_feature_names_out()
pd.DataFrame(word_vectors_bow.toarray(), columns=vocab)

Unnamed: 0,00,000,00716,0080,0081,0082,0083,0084,0085,0086,...,zxiusza2s7,zxuecwobqp,zyitjkbklc,zylu2al27f,zymrlzofxm,zyuakjdi4h,zz0mflmpfd,zzh5moxrtq,zzthdwqbfy,zzyjzzuhlu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Reducing sparsity

### 1. Preprocessing document text

In [None]:
# Import relevant libraries
import spacy
import re

# Load English language model
nlp = spacy.load('en_core_web_sm')

In [None]:
# Preprocessing function
def clean(text):

    # Remove alphabetic words
    text = ' '.join(re.compile(r'[^a-zA-Z]+').split(text))

    # Create spacy object
    doc = nlp(text)

    # List to store clean text
    filtered_text = []

    # Iterate over document and save word lemmas
    for token in doc:
        filtered_text.append(token.lemma_)

    return " ".join(word for word in filtered_text)

In [18]:
# Apply function
df['text_clean'] = df['text'].apply(clean)

In [21]:
# Print dataset
df.head(10)

Unnamed: 0,text,text_clean
0,RT @rssurjewala: Critical question: Was PayTM ...,RT rssurjewala critical question be PayTM info...
1,RT @Hemant_80: Did you vote on #Demonetization...,RT Hemant do you vote on Demonetization on Mod...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",RT roshankar former finsec RBI Dy Governor CBD...
3,RT @ANI_news: Gurugram (Haryana): Post office ...,RT ANI news Gurugram Haryana Post office emplo...
4,RT @satishacharya: Reddy Wedding! @mail_today ...,RT satishacharya Reddy Wedding mail today cart...
5,@DerekScissors1: Indias #demonetization: #Bla...,DerekScissors India s demonetization Blackmo...
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,RT gauravcsawant Rs lakh loot from a bank in K...
7,RT @Joydeep_911: Calling all Nationalists to j...,RT Joydeep call all Nationalists to join Walk ...
8,RT @sumitbhati2002: Many opposition leaders ar...,RT sumitbhati many opposition leader be with n...
9,National reform now destroyed even the essence...,national reform now destroy even the essence o...


In [22]:
# Arguments: default values
word_bow = CountVectorizer(binary=False,  # Count the occurances of the terms
                           lowercase=True,  # Lowercase
                           )

In [23]:
# Fit and transform training data
word_vectors_bow = word_bow.fit_transform(df['text_clean'].values)

In [24]:
# Shape of the matrix
word_vectors_bow

<5157x12813 sparse matrix of type '<class 'numpy.int64'>'
	with 86053 stored elements in Compressed Sparse Row format>

In [25]:
# Document representation
vocab = word_bow.get_feature_names_out()
pd.DataFrame(word_vectors_bow.toarray(), columns=vocab)

Unnamed: 0,aa,aaadhar,aaanupriyaaa,aadhaar,aadhar,aadhe,aadityagautom,aadmi,aagr,aaj,...,zymrlzofxm,zynql,zyuakjdi,zz,zzdxhds,zzh,zzl,zzthdwqbfy,zzygw,zzyjzzuhlu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Keep top frequent terms

In [27]:
# Update arguments just with top 5000
word_bow = CountVectorizer(binary=False,  # Count the occurances of the terms
                           lowercase=True,  # Lowercase
                           max_features=5000,  # Max features
                           )

In [28]:
# Fit and transform training data
word_vectors_bow = word_bow.fit_transform(df['text_clean'].values)

In [29]:
# Shape of the matrix
word_vectors_bow

<5157x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 78074 stored elements in Compressed Sparse Row format>

In [30]:
# Document representation
vocab = word_bow.get_feature_names_out()
pd.DataFrame(word_vectors_bow.toarray(), columns=vocab)

Unnamed: 0,aa,aadhaar,aadhar,aadhe,aadmi,aajtak,aam,aamaadmi,aamaadmiparty,aamir,...,zt,zu,zv,zvup,zvvbjg,zwcmfzca,zxhhmuwceq,zyitjkbklc,zylu,zz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3. Thresholding the occurrence of terms

In [31]:
# Update arguments
word_bow = CountVectorizer(binary=False,  # Count the occurrences of the terms
                           lowercase=True,  # Lowercase
                           max_df=500,  # Max occurrence
                           min_df=10,  # Min occurrence
                           )

In [32]:
# Fit and transform training data
word_vectors_bow = word_bow.fit_transform(df['text_clean'].values)

In [33]:
# Shape of the matrix
word_vectors_bow

<5157x967 sparse matrix of type '<class 'numpy.int64'>'
	with 41103 stored elements in Compressed Sparse Row format>

In [34]:
# Document representation
vocab = word_bow.get_feature_names_out()
pd.DataFrame(word_vectors_bow.toarray(), columns=vocab)

Unnamed: 0,aa,aadhaar,aadmi,aam,aamaadmiparty,aap,able,about,abt,accept,...,yet,yogi,you,young,your,youtube,youtubers,yrdeshmukh,yt,zone
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
5153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 4. N-gram BoW

In [35]:
# Update arguments
word_bow = CountVectorizer(binary=False,  # Count the occurances of the terms
                           lowercase=True,  # Lowercase
                           ngram_range=(2,2)  # bi-gram
                           )

In [36]:
# Fit and transform training data
word_vectors_bow = word_bow.fit_transform(df['text_clean'].values)

In [37]:
# Shape of the matrix
word_vectors_bow

<5157x42679 sparse matrix of type '<class 'numpy.int64'>'
	with 84906 stored elements in Compressed Sparse Row format>

In [38]:
# Features
word_bow.get_feature_names_out()

array(['aa gaye', 'aa https', 'aa lfy', ..., 'zzh moxrtq', 'zzl offa',
       'zzygw em'], dtype=object)

In [39]:
# Document representation
vocab = word_bow.get_feature_names_out()
pd.DataFrame(word_vectors_bow.toarray(), columns=vocab)

Unnamed: 0,aa gaye,aa https,aa lfy,aa lvsy,aa mazc,aa pje,aa rahe,aa to,aa yogi,aaadhar expansion,...,zwpql frwn,zwsoa google,zyitjkbklc via,zylu al,zymrlzofxm https,zz al,zz mflmpfd,zzh moxrtq,zzl offa,zzygw em
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
word_bow.get_feature_names_out()

array(['aa gaye', 'aa https', 'aa lfy', ..., 'zzh moxrtq', 'zzl offa',
       'zzygw em'], dtype=object)

In [42]:
vocab = word_bow.get_feature_names_out()
pd.DataFrame(word_vectors_bow.toarray(), columns = vocab)

Unnamed: 0,aa gaye,aa https,aa lfy,aa lvsy,aa mazc,aa pje,aa rahe,aa to,aa yogi,aaadhar expansion,...,zwpql frwn,zwsoa google,zyitjkbklc via,zylu al,zymrlzofxm https,zz al,zz mflmpfd,zzh moxrtq,zzl offa,zzygw em
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TF-IDF

In [43]:
# Import tfidf vectorizer function from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [44]:
# Creating an object for the vectorizer
word_tfidf = TfidfVectorizer()

In [45]:
# Fit and transform training data
word_vectors_tfidf = word_tfidf.fit_transform(df['text_clean'].values)

In [46]:
# Shape of the matrix
# Same number of features
word_vectors_tfidf

<5157x12813 sparse matrix of type '<class 'numpy.float64'>'
	with 86053 stored elements in Compressed Sparse Row format>

In [47]:
# Document representation
vocab = word_tfidf.get_feature_names_out()
pd.DataFrame(word_vectors_tfidf.toarray(), columns=vocab)

Unnamed: 0,aa,aaadhar,aaanupriyaaa,aadhaar,aadhar,aadhe,aadityagautom,aadmi,aagr,aaj,...,zymrlzofxm,zynql,zyuakjdi,zz,zzdxhds,zzh,zzl,zzthdwqbfy,zzygw,zzyjzzuhlu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244035,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Modifying argument values

In [48]:
# Update arguments
word_tfidf = TfidfVectorizer(ngram_range=(2,2))  # bi-grams

In [49]:
# Fit and transform training data
word_vectors_tfidf = word_tfidf.fit_transform(df['text_clean'].values)

In [50]:
# Shape of the matrix
word_vectors_tfidf

<5157x42679 sparse matrix of type '<class 'numpy.float64'>'
	with 84906 stored elements in Compressed Sparse Row format>

In [51]:
vocab = word_tfidf.get_feature_names_out()
pd.DataFrame(word_vectors_tfidf.toarray(), columns=vocab)

Unnamed: 0,aa gaye,aa https,aa lfy,aa lvsy,aa mazc,aa pje,aa rahe,aa to,aa yogi,aaadhar expansion,...,zwpql frwn,zwsoa google,zyitjkbklc via,zylu al,zymrlzofxm https,zz al,zz mflmpfd,zzh moxrtq,zzl offa,zzygw em
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
