### Text Tweet Classification

In [172]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import autoreload
import csv
import os
import re

from collections import defaultdict, Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

sb.set()
sb.set_style("darkgrid")

pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 100)

In [75]:
df = pd.read_excel(r"D:\Open Classroom\Datasets\Text (Tweet) Classification\text_classification_dataset.xlsx")
df.head()

Unnamed: 0,text,type
0,@ACNI2012 @TheToka920 Never knew having 1 or 2...,sports
1,"MYCA Magical Moments:\n\nSeptember, 2011: Sham...",sports
2,The current state of last year's @BBL finalist...,sports
3,@HOLLYJISOO Why did you bring a cricket...,sports
4,Babar Azam only Pakistani included in the ICC ...,sports


In [76]:
df.shape
# Contains 1162 - Rows
#          2 - Columns

(1162, 2)

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1162 entries, 0 to 1161
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1162 non-null   object
 1   type    1162 non-null   object
dtypes: object(2)
memory usage: 18.3+ KB


In [78]:
df.describe()

Unnamed: 0,text,type
count,1162,1162
unique,1162,4
top,me to my family members: “I want to go into po...,politics
freq,1,345


In [79]:
df.isnull().sum()
# Doesnt contain any Null values

text    0
type    0
dtype: int64

In [80]:
df[df.duplicated() == True]
# No duplicates in the dataset

Unnamed: 0,text,type


Replace special string characters using regresion analysis

In [81]:
for i in range(len(df['text'])):
    df['text'][i]=re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','', df['text'][i], flags=re.MULTILINE)

In [82]:
df.head()

Unnamed: 0,text,type
0,@ACNI2012 @TheToka920 Never knew having 1 or 2...,sports
1,"MYCA Magical Moments:\n\nSeptember, 2011: Sham...",sports
2,The current state of last year's @BBL finalist...,sports
3,@HOLLYJISOO Why did you bring a cricket...,sports
4,Babar Azam only Pakistani included in the ICC ...,sports


In [83]:
for i in range(len(df['text'])):
    df['text'][i] = re.sub('@([a-zA-Z])+','', df['text'][i], flags = re.MULTILINE)

In [84]:
df.head()

Unnamed: 0,text,type
0,2012 920 Never knew having 1 or 2 followers ha...,sports
1,"MYCA Magical Moments:\n\nSeptember, 2011: Sham...",sports
2,The current state of last year's finalists - ...,sports
3,Why did you bring a cricket...,sports
4,Babar Azam only Pakistani included in the ICC ...,sports


#### One-Hot Encoding

In [85]:
le = LabelEncoder()

In [142]:
df["type_num"] = le.fit_transform(df["type"])
df.head()

Unnamed: 0,text,type,type_num
0,2012 920 Never knew having 1 or 2 followers ha...,sports,3
1,"MYCA Magical Moments:\n\nSeptember, 2011: Sham...",sports,3
2,The current state of last year's finalists - ...,sports,3
3,Why did you bring a cricket...,sports,3
4,Babar Azam only Pakistani included in the ICC ...,sports,3


In [143]:
df["type"].value_counts()

politics         345
medical          299
entertainment    260
sports           258
Name: type, dtype: int64

In [141]:
df[["type", "type_num"]].sample(frac = 1).head()

Unnamed: 0,type,type_num
31,sports,3
1064,sports,3
826,politics,2
592,medical,1
209,entertainment,0


#### Train and Test Data

Train the Dataset to Learn the Vocabulary [Document-term Matrix] and Testing the Dataset

In [88]:
x = df["text"]
y = df["type_num"]

In [89]:
%timeit x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=4)

1.22 ms ± 106 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [90]:
print("x train :", x_train.shape)
print("x test :", x_test.shape)
print("y train :", y_train.shape)
print("y test :", y_test.shape)

x train : (929,)
x test : (233,)
y train : (929,)
y test : (233,)


#### Vectorization

In [98]:
vect = CountVectorizer()
%timeit vect.fit(x_train)

33.3 ms ± 2.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [102]:
x_train_vect = vect.fit_transform(x_train) # Document-Term Matrix
x_train_vect

<929x4707 sparse matrix of type '<class 'numpy.int64'>'
	with 14873 stored elements in Compressed Sparse Row format>

x_train_vect has 929 observations and 4707 tokens

In [93]:
x_test_vect = vect.fit_transform(x_test)
x_test_vect

<233x1786 sparse matrix of type '<class 'numpy.int64'>'
	with 3826 stored elements in Compressed Sparse Row format>

x_test_vect has 233 observations and 1786 tokens

#### Build and Evaluate a Model

1. Multinomial Naive Bayes

In [118]:
nb_model = MultinomialNB()
nb_model.fit(x_train_vect, y_train)

MultinomialNB()

In [178]:
nb_pred = nb_model.predict(x_train_vect)
nb_pred

array([2, 0, 1, 3, 2, 0, 1, 2, 3, 2, 0, 1, 1, 1, 1, 1, 0, 1, 3, 0, 3, 1,
       3, 3, 3, 2, 3, 3, 2, 0, 2, 3, 2, 2, 2, 1, 2, 0, 3, 1, 1, 0, 2, 3,
       2, 2, 0, 2, 0, 1, 3, 3, 3, 0, 1, 1, 3, 0, 1, 0, 2, 2, 0, 1, 1, 0,
       1, 2, 2, 1, 0, 2, 3, 0, 2, 3, 1, 2, 1, 0, 2, 2, 2, 2, 2, 2, 1, 2,
       0, 1, 1, 2, 1, 1, 2, 3, 0, 0, 3, 3, 2, 2, 2, 3, 3, 1, 1, 2, 1, 2,
       2, 3, 0, 2, 0, 1, 3, 2, 2, 1, 1, 3, 1, 2, 1, 2, 3, 2, 3, 0, 2, 3,
       1, 2, 1, 2, 0, 2, 3, 1, 2, 1, 1, 3, 2, 3, 3, 3, 2, 0, 0, 0, 3, 0,
       1, 1, 3, 1, 0, 0, 0, 3, 2, 3, 1, 0, 1, 3, 2, 2, 0, 0, 2, 0, 0, 2,
       1, 2, 0, 0, 1, 3, 3, 0, 1, 1, 0, 3, 3, 1, 0, 0, 0, 3, 0, 0, 1, 2,
       0, 0, 2, 3, 1, 1, 2, 1, 2, 2, 2, 3, 3, 0, 1, 2, 0, 3, 2, 0, 3, 2,
       1, 3, 2, 3, 1, 3, 1, 2, 2, 3, 2, 2, 1, 2, 0, 2, 2, 2, 1, 2, 2, 1,
       3, 2, 1, 2, 0, 0, 1, 1, 3, 2, 0, 1, 0, 2, 1, 2, 1, 3, 0, 2, 3, 1,
       2, 0, 1, 2, 3, 0, 3, 3, 0, 3, 2, 1, 2, 3, 1, 3, 3, 1, 0, 0, 0, 2,
       0, 1, 1, 1, 3, 2, 0, 3, 2, 2, 2, 2, 3, 1, 2,

In [179]:
nb_score = nb_model.score(x_train_vect, y_train)
nb_score

0.9967707212055974

In [180]:
print("             Politics     Medical       Entertainment          Sports")
prob_log = nb_model.predict_proba(x_train_vect)
prob_log

             Politics     Medical       Entertainment          Sports


array([[2.17644274e-04, 1.10121623e-03, 9.98562247e-01, 1.18892579e-04],
       [9.99688011e-01, 1.04895054e-05, 8.67116470e-07, 3.00632389e-04],
       [2.62061237e-06, 9.99996323e-01, 5.83388659e-07, 4.72699042e-07],
       ...,
       [8.56124635e-08, 9.99970829e-01, 7.31488638e-06, 2.17701768e-05],
       [9.86291006e-01, 2.33815818e-05, 2.56623855e-05, 1.36599501e-02],
       [1.86862636e-04, 1.14035108e-03, 3.25825235e-05, 9.98640204e-01]])

2. Decision Trees

In [181]:
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train_vect, y_train)

DecisionTreeClassifier()

In [182]:
dt_pred = dt_model.predict(x_train_vect)
dt_pred

array([2, 0, 1, 3, 2, 0, 1, 2, 3, 2, 0, 1, 1, 1, 1, 1, 0, 1, 3, 0, 3, 1,
       3, 3, 3, 2, 3, 3, 2, 0, 2, 3, 2, 2, 2, 1, 2, 0, 3, 1, 1, 0, 2, 3,
       2, 2, 0, 2, 0, 1, 3, 3, 3, 0, 1, 1, 3, 0, 1, 0, 2, 2, 0, 1, 1, 0,
       1, 2, 2, 1, 0, 2, 3, 0, 2, 3, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 1, 2,
       0, 1, 1, 2, 1, 1, 2, 3, 0, 0, 3, 3, 2, 2, 2, 3, 3, 1, 1, 2, 1, 2,
       2, 3, 0, 2, 0, 1, 3, 2, 2, 1, 1, 3, 1, 2, 1, 2, 3, 2, 3, 0, 2, 3,
       1, 2, 1, 2, 0, 2, 3, 1, 2, 1, 1, 3, 2, 3, 3, 3, 2, 0, 0, 0, 3, 0,
       1, 1, 3, 1, 0, 0, 0, 3, 2, 3, 1, 0, 1, 3, 2, 2, 0, 0, 2, 0, 0, 2,
       1, 2, 0, 0, 1, 3, 3, 0, 1, 1, 0, 3, 3, 1, 0, 0, 0, 3, 0, 0, 1, 2,
       0, 0, 2, 3, 1, 1, 2, 1, 2, 2, 2, 3, 3, 0, 1, 2, 0, 3, 2, 0, 3, 2,
       1, 3, 2, 3, 1, 3, 1, 2, 2, 3, 2, 2, 1, 2, 0, 2, 2, 2, 1, 2, 2, 1,
       3, 2, 1, 2, 0, 0, 1, 1, 3, 2, 0, 1, 0, 2, 1, 2, 1, 3, 0, 2, 3, 1,
       2, 0, 1, 2, 3, 0, 3, 3, 0, 3, 2, 1, 2, 3, 1, 3, 3, 1, 0, 0, 0, 2,
       0, 2, 1, 1, 3, 2, 0, 3, 2, 2, 2, 2, 3, 1, 2,

In [185]:
dt_score = dt_model.score(x_train_vect, y_train)
dt_score

1.0

In [203]:
dt_prob = dt_model.predict_proba(x_train_vect)
print(dt_prob)

[[0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 ...
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]]
