# Assignment 5 - Naive Bayes
## Raja Jain | 2022-02-27

In [1]:
import string
from pprint import pprint as pp
import docx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sweetviz as sv
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pandarallel import pandarallel
from scipy.stats import norm
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler


def getText(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return "\n".join(fullText)


In [2]:
text = getText("Assignment 5 - Naive Bayes.docx")
print(text)


COMP 4448: Data Science Tools II						          Assignment 5
Directions: Do this assignment in Jupyter Notebook and provide screenshots of code and output in this word document wherever required. You will upload this word document containing screenshots of code and answers as well as your Jupyter Notebook to Canvas. All assignments will be submitted and graded through canvas and grades will be transferred to the 2U platform. 
Goal: The goal of this assignment is to give you the opportunity to implement the Naïve Bayes Algorithm from scratch as well as using tools built into sklearn. 
Packages: Core packages you may need for this assignment include numpy, pandas, sklearn, matplotlib.pyplot and/or seaborn, nltk, string, and re. 
Note: In sklearn, there are different types of Naive Bayes constructors for fitting Naïve Bayes models, dependent on the nature of the data. For example:
MultinomialNB() is used for text classification when data is represented as feature vectors. 
ComplementNB() 

In [3]:
# Question 1:
# Upload the income_evaluation_cat.csv provided on canvas.  The features in this data include workclass, education, race, and gender. The output variable is income and contains two categorical values (<=50k or >50k) indicating whether the income of an individual is less than/equal to $50,000 or greater than $50,000 respectively. Print the unique values of each variable in this data.

income_evaluation_cat = pd.read_csv("income_evaluation_cat.csv", skipinitialspace=True)
# sanitize column names
income_evaluation_cat.columns = income_evaluation_cat.columns.str.lower().str.strip().str.replace(" ", "_")


for col in income_evaluation_cat.columns:
    print("\n", "--" * 10, "column:", col, "--" * 10)
    print(income_evaluation_cat[col].value_counts())



 -------------------- column: workclass --------------------
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

 -------------------- column: education --------------------
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

 -------------------- column: race --------------------
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

 ------

In [4]:
# You will implement Naïve Bayes from scratch using Bayes’ rule. You can do your calculations in Python, but you would not use the sklearn package. Suppose that all the income_evaluation_cat.csv data you uploaded is the training data, classify a test instance, X = [“Private”, “Bachelors”, “White”, “Female”] into the class income<=50 or income>50k. You need to compute the posterior probabilities P(income<=50/X) and P(income>50k/X), then print the class with the greater posterior probability as the predicted class. You don’t need to define a function, but you could if you want.


class NaiveBayesCategorical:
    def __init__(self, data: pd.DataFrame = income_evaluation_cat, target_column: str = "income"):
        self.data = data
        self.target_column = target_column
        self.freq_dict = self.generate_frequency_dictionary()
        self.len_data = len(self.data)
        self.unique_targets = self.data[self.target_column].unique()

    def generate_frequency_dictionary(self):
        freq_dict = {}
        for col in self.data.columns:
            data = pd.crosstab(self.data[col], self.data[self.target_column], margins=True)
            freq_dict[col] = data.to_dict(orient="index")
        return freq_dict

    def predict(self, X: list = None, verbose: bool = False):
        X = {col: X[i] for i, col in enumerate(self.data.drop(columns=self.target_column).columns)}

        probability_dict = {k: 1 for k in self.unique_targets}

        for k, v in X.items():
            count_predictor = self.freq_dict[k][v]["All"]
            probability_predictor = count_predictor / self.len_data

            for target_class in self.unique_targets:

                count_class = self.freq_dict[k]["All"][target_class]
                probability_class = count_class / self.len_data

                count_predictor_in_class = self.freq_dict[k][v][target_class]
                probability_predictor_in_class = count_predictor_in_class / count_class
                # print(
                #     k, v, target_class, probability_predictor, probability_class, probability_predictor_in_class
                # )
                probability_dict[target_class] *= probability_predictor_in_class

        for target_class in probability_dict.keys():
            probability_dict[target_class] *= self.freq_dict[self.target_column]["All"][target_class] / self.len_data

        sum_probs = sum(probability_dict.values())
        for target_class in probability_dict.keys():
            probability_dict[target_class] /= sum_probs

        predicted_class = max(probability_dict, key=probability_dict.get)
        if verbose:
            print(
                " Input:",
                X,
                "\n",
                "Class Probabilities:",
                probability_dict,
                "\n",
                "Predicted Class:",
                predicted_class,
            )
        return predicted_class


prediction_instance = ["Private", "Bachelors", "White", "Female"]

nb_categorical = NaiveBayesCategorical(income_evaluation_cat, "income")
nb_categorical.predict(X=prediction_instance, verbose=True)
nb_categorical.predict(X=["Self-emp-inc", "Doctorate", "White", "Male"], verbose=True)


 Input: {'workclass': 'Private', 'education': 'Bachelors', 'race': 'White', 'gender': 'Female'} 
 Class Probabilities: {'<=50K': 0.7919784735489185, '>50K': 0.20802152645108152} 
 Predicted Class: <=50K
 Input: {'workclass': 'Self-emp-inc', 'education': 'Doctorate', 'race': 'White', 'gender': 'Male'} 
 Class Probabilities: {'<=50K': 0.05529590397879263, '>50K': 0.9447040960212074} 
 Predicted Class: >50K


'>50K'

In [5]:
# Preprocess or transform the features in the income_evaluation_cat.csv data using an appropriate scaler in sklearn. You don’t need to transform the output variable; it should still work fine in a text format.
le = LabelEncoder()

encodings = []


def encode_labels(encoder, X):
    encoder.fit(X)
    encodings.append(list(enumerate(encoder.classes_)))
    return encoder.transform(X)


income_evaluation_cat_encoded = income_evaluation_cat.drop(columns="income").apply(lambda x: encode_labels(le, x))
pp(encodings)
income_evaluation_cat_encoded["income"] = income_evaluation_cat["income"]
income_evaluation_cat_encoded


[[(0, '?'),
  (1, 'Federal-gov'),
  (2, 'Local-gov'),
  (3, 'Never-worked'),
  (4, 'Private'),
  (5, 'Self-emp-inc'),
  (6, 'Self-emp-not-inc'),
  (7, 'State-gov'),
  (8, 'Without-pay')],
 [(0, '10th'),
  (1, '11th'),
  (2, '12th'),
  (3, '1st-4th'),
  (4, '5th-6th'),
  (5, '7th-8th'),
  (6, '9th'),
  (7, 'Assoc-acdm'),
  (8, 'Assoc-voc'),
  (9, 'Bachelors'),
  (10, 'Doctorate'),
  (11, 'HS-grad'),
  (12, 'Masters'),
  (13, 'Preschool'),
  (14, 'Prof-school'),
  (15, 'Some-college')],
 [(0, 'Amer-Indian-Eskimo'),
  (1, 'Asian-Pac-Islander'),
  (2, 'Black'),
  (3, 'Other'),
  (4, 'White')],
 [(0, 'Female'), (1, 'Male')]]


Unnamed: 0,workclass,education,race,gender,income
0,7,9,4,1,<=50K
1,6,9,4,1,<=50K
2,4,11,4,1,<=50K
3,4,1,2,1,<=50K
4,4,9,2,0,<=50K
...,...,...,...,...,...
32556,4,7,4,0,<=50K
32557,4,11,4,1,>50K
32558,4,11,4,0,<=50K
32559,4,11,4,1,<=50K


In [6]:
# Randomly split the transformed input data and the output data into X_train, y_train, X_test and y_test using tools in sklearn.
X_train, X_test, y_train, y_test = train_test_split(
    income_evaluation_cat_encoded.drop(columns=["income"]),
    income_evaluation_cat_encoded["income"],
    test_size=0.2,
    random_state=42,
)


In [7]:
# Use an appropriate Naïve Bayes constructor in sklearn to construct and fit a Naïve Bayes model on the training data, then use the model to compute the accuracy score of the training and test set.
cat_NB = CategoricalNB(fit_prior=True)
cat_NB.fit(X_train, y_train)
print("Training accuracy", cat_NB.score(X_train, y_train))
print("Test accuracy", cat_NB.score(X_test, y_test))


Training accuracy 0.7838221744471745
Test accuracy 0.7865806847842776


# Compare

In [8]:
pp(encodings)
cat_NB.predict_proba([[4, 9, 4, 0]])
cat_NB.predict_proba([[5, 10, 4, 1]])


[[(0, '?'),
  (1, 'Federal-gov'),
  (2, 'Local-gov'),
  (3, 'Never-worked'),
  (4, 'Private'),
  (5, 'Self-emp-inc'),
  (6, 'Self-emp-not-inc'),
  (7, 'State-gov'),
  (8, 'Without-pay')],
 [(0, '10th'),
  (1, '11th'),
  (2, '12th'),
  (3, '1st-4th'),
  (4, '5th-6th'),
  (5, '7th-8th'),
  (6, '9th'),
  (7, 'Assoc-acdm'),
  (8, 'Assoc-voc'),
  (9, 'Bachelors'),
  (10, 'Doctorate'),
  (11, 'HS-grad'),
  (12, 'Masters'),
  (13, 'Preschool'),
  (14, 'Prof-school'),
  (15, 'Some-college')],
 [(0, 'Amer-Indian-Eskimo'),
  (1, 'Asian-Pac-Islander'),
  (2, 'Black'),
  (3, 'Other'),
  (4, 'White')],
 [(0, 'Female'), (1, 'Male')]]




array([[0.05562795, 0.94437205]])

In [9]:
# Question 2
# Upload the income_evaluation_continuous.csv data provided on canvas.  The features in this data include age, education_num, and hours_per_week. The output variable is income and contains two categorical values (<=50k or >50k) indicating whether the income of an individual is less than/equal to $50,000 or greater than $50,000 respectively. Compute the mean and standard deviation of each input variable such that the results are presented on the same table or data frame. You can call the .apply() function on the pandas DataFrame.
income_evaluation_continuous = pd.read_csv("income_evaluation_continuous.csv", skipinitialspace=True)
income_stats = income_evaluation_continuous.agg(["mean", "std"])
display(income_stats)


Unnamed: 0,age,education_num,hours_per_week
mean,38.581647,10.080679,40.437456
std,13.640433,2.57272,12.347429


In [10]:
# You will implement Naïve Bayes from scratch using Bayes’ rule. Assume that all the features or input variables follow a normal distribution. You can do your calculations in Python, but you would not use the sklearn package. You can use the density function inside the stats module in the SciPy package. Given that all the income_evaluation_continuous.csv data you uploaded is the training data , classify a test instance, X = [30, 10, 45], into the class income<=50 or income>50k. You need to compute the posterior probabilities P(income<=50/X) and P(income>50k/X), then print the class with the greater posterior probability as the predicted class. You don’t need to define a function, but you could if you want.


class NaiveBayesContinuous:
    def __init__(
        self,
        data: pd.DataFrame = income_evaluation_continuous,
        target_column: str = "income",
    ):
        self.data = data
        self.target_column = target_column
        self.unique_targets = self.data[self.target_column].unique()

        self.stats_tables = self.calculate_stats()
        self.target_value_likelihoods = self.data.groupby(target_column)[target_column].count() / len(self.data)

    def calculate_stats(self):
        stats_tables = {}
        for target in self.unique_targets:
            stats_tables[target] = self.data[self.data[self.target_column] == target].agg(["mean", "std"]).T
        return stats_tables

    def predict(self, X: list = None, verbose: bool = False):
        probability_dict = {k: 1 for k in self.unique_targets}

        for target, table in self.stats_tables.items():
            table["X"] = X
            table["density"] = table.apply(lambda x: norm.pdf(x["X"], x["mean"], x["std"]), axis=1)

            probability_dict[target] = table["density"].prod() * self.target_value_likelihoods[target]

        sum_probs = sum(probability_dict.values())
        for target_class in probability_dict.keys():
            probability_dict[target_class] /= sum_probs

        predicted_class = max(probability_dict, key=probability_dict.get)
        if verbose:
            print(
                " Input:",
                X,
                "\n",
                "Class Probabilities:",
                probability_dict,
                "\n",
                "Predicted Class:",
                predicted_class,
            )
        return predicted_class


nb_continuous = NaiveBayesContinuous()
display(nb_continuous.stats_tables)
nb_continuous.predict(X=[30, 10, 45], verbose=True)


{'<=50K':                      mean        std
 age             36.783738  14.020088
 education_num    9.595065   2.436147
 hours_per_week  38.840210  12.318995,
 '>50K':                      mean        std
 age             44.249841  10.519028
 education_num   11.611657   2.385129
 hours_per_week  45.473026  11.012971}

 Input: [30, 10, 45] 
 Class Probabilities: {'<=50K': 0.834602773454588, '>50K': 0.16539722654541197} 
 Predicted Class: <=50K


'<=50K'

In [11]:
# Preprocess or transform the features in the income_evaluation_cont.csv data using an appropriate scaler in sklearn. You don’t need to transform the output variable; it should still work fine in a text format.
income_evaluation_continuous


Unnamed: 0,age,education_num,hours_per_week,income
0,39,13,40,<=50K
1,50,13,13,<=50K
2,38,9,40,<=50K
3,53,7,40,<=50K
4,28,13,40,<=50K
...,...,...,...,...
32556,27,12,38,<=50K
32557,40,9,40,>50K
32558,58,9,40,<=50K
32559,22,9,20,<=50K


In [12]:
# Randomly split the input and output data into X_train, y_train, X_test and y_test using tools in sklearn.
X_train, X_test, y_train, y_test = train_test_split(
    income_evaluation_continuous.drop(columns="income"),
    income_evaluation_continuous["income"],
    test_size=0.2,
    random_state=42,
)


In [13]:
# Use an appropriate Naïve Bayes constructor in sklearn to construct and fit a Naïve Bayes model on the training data, then use the model to compute the accuracy score of the training and test set.

gaussian_NB = GaussianNB()
gaussian_NB.fit(X_train, y_train)
print("Training accuracy", gaussian_NB.score(X_train, y_train))
print("Test accuracy", gaussian_NB.score(X_test, y_test))


Training accuracy 0.7988329238329238
Test accuracy 0.8000921234454169


In [14]:
# Question 3:
# You will now implement a Naïve Bayes for text classification to detect fake or true news.
# Upload the True.csv data provided on canvas into Python. You will create a new data frame by selecting the “title” and “text” columns, then, adding a new column called “news_type” where all the values on this new column are “True”. So, your new data frame should have three columns; “title”, “text” and “news_type”.
true_data = pd.read_csv("True.csv", skipinitialspace=True)
true_data = true_data[["title", "text"]]
true_data["news_type"] = "True"
true_data


Unnamed: 0,title,text,news_type
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,True
...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,True
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",True
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,True
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,True


In [15]:
# Upload the Fake.csv data provided on canvas into Python. You will create a new data frame by selecting the “title” and “text” columns, then, adding a new column called “news_type” where all the values on this new column are “Fake”. So, your new data frame should have three columns; “title”, “text” and “news_type”.
fake_data = pd.read_csv("Fake.csv", skipinitialspace=True)
fake_data = fake_data[["title", "text"]]
fake_data["news_type"] = "Fake"
fake_data


Unnamed: 0,title,text,news_type
0,Donald Trump Sends Out Embarrassing New Year’s...,Donald Trump just couldn t wish all Americans ...,Fake
1,Drunk Bragging Trump Staffer Started Russian C...,House Intelligence Committee Chairman Devin Nu...,Fake
2,Sheriff David Clarke Becomes An Internet Joke ...,"On Friday, it was revealed that former Milwauk...",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name ...,"On Christmas day, Donald Trump announced that ...",Fake
4,Pope Francis Just Called Out Donald Trump Duri...,Pope Francis used his annual Christmas Day mes...,Fake
...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Fake
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Fake
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Fake
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Fake


In [16]:
# Merge the data frame in a) and b) so that one of the data frames is stacked vertically on top of the other. Combine the text in the “title” and “text” columns of the merged data frame into another column called “news”. Drop the “title” and “text” columns so that your final data frame is has only two columns, “news” and “news_type”. Print the first five rows of your final data frame.
merged_data = pd.concat([true_data, fake_data], ignore_index=True)
merged_data["news"] = merged_data["title"] + " " + merged_data["text"]
merged_data.drop(columns=["title", "text"], inplace=True)
display(merged_data.head())


Unnamed: 0,news_type,news
0,True,"As U.S. budget fight looms, Republicans flip t..."
1,True,U.S. military to accept transgender recruits o...
2,True,Senior U.S. Republican senator: 'Let Mr. Muell...
3,True,FBI Russia probe helped by Australian diplomat...
4,True,Trump wants Postal Service to charge 'much mor...


In [17]:
# Preprocess your data by cleaning the textual data in the “news” column and removing the stop words, special characters, punctuations, etc especially at the beginning and end of each word. You can display any messy news text before you clean the data, then display the messy news text again after cleaning the data to see if your data cleaning worked well. Also, drop instances where the news text is less than 50 words.

table = str.maketrans("", "", string.punctuation)
stop_words = set(stopwords.words("english"))


def clean(text):
    try:
        tokens = word_tokenize(text)
        stripped = [str(w).lower().translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
        words = [w for w in words if not w in stop_words]
        if len(words) > 50:
            return " ".join(words)
        else:
            return None
    except TypeError:
        return None


In [18]:
print("Raw Text")
print(merged_data["news"][0])

print("Clean Text")
print(clean(merged_data["news"][0]))


Raw Text
As U.S. budget fight looms, Republicans flip their fiscal script WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases

In [19]:
pandarallel.initialize()
merged_data["news"] = merged_data["news"].parallel_apply(clean)
merged_data


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Unnamed: 0,news_type,news
0,True,us budget fight looms republicans flip fiscal ...
1,True,us military accept transgender recruits monday...
2,True,senior us republican senator let mr mueller jo...
3,True,fbi russia probe helped australian diplomat ti...
4,True,trump wants postal service charge much amazon ...
...,...,...
44893,Fake,mcpain john mccain furious iran treated us sai...
44894,Fake,justice yahoo settles email privacy classactio...
44895,Fake,sunnistan us allied safe zone plan take territ...
44896,Fake,blow million al jazeera america finally calls ...


In [20]:
# Transform the input text data into feature vectors where the entries of the feature vectors are term-frequency-inverse-document-frequency. Use the TfidfVectorizer() in sklearn.
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", decode_error="ignore", max_features=100)

merged_data = merged_data[merged_data["news"].notnull()]
news_tfidf = tfidf.fit_transform(merged_data["news"])
news_tfidf.toarray()


array([[0.        , 0.0980796 , 0.        , ..., 0.        , 0.        ,
        0.1279775 ],
       [0.        , 0.52997391, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.11903907, 0.        , ..., 0.        , 0.05109624,
        0.        ],
       ...,
       [0.05670896, 0.05995031, 0.05854585, ..., 0.08964584, 0.02573302,
        0.1564503 ],
       [0.        , 0.        , 0.48992678, ..., 0.        , 0.08613624,
        0.        ],
       [0.27782281, 0.        , 0.        , ..., 0.        , 0.06303432,
        0.        ]])

In [21]:
# Spit the feature vectors and the output variable into into X_train, y_train, X_test and y_test, you can let the test set be 30% of the entire data.

X_train, X_test, y_train, y_test = train_test_split(
    news_tfidf.toarray(), merged_data["news_type"], test_size=0.3, random_state=42
)


In [22]:
# Fit an appropriate Naïve Bayes model and compute the training and test accuracy of the model. Is there overfitting?
multinomial_NB = MultinomialNB()
multinomial_NB.fit(X_train, y_train)
print("Training accuracy", multinomial_NB.score(X_train, y_train))
print("Test accuracy", multinomial_NB.score(X_test, y_test))


Training accuracy 0.9042817679558011
Test accuracy 0.9007412181759588


In [23]:
# Fit a Naïve Bayes using cross validation and print the average cross validation score as well as the standard deviation of the cross-validation scores.
multinomial_nb_cross_val_scores = cross_val_score(
    multinomial_NB, news_tfidf.toarray(), merged_data["news_type"], cv=31
)
print("Average cross validation score", multinomial_nb_cross_val_scores.mean())
print("STD cross validation score", multinomial_nb_cross_val_scores.std())


Average cross validation score 0.8987470744511761
STD cross validation score 0.0552014938734187


In [24]:
# Select some hypermeters of your choice and tune using the grid search cross validation. Use some other hyperparameters than those used in class examples.

multinomial_NB_grid = GridSearchCV(MultinomialNB(), {"alpha": [0.1, 1, 10, 100]}, cv=10, n_jobs=-1)

multinomial_NB_grid_cross_val_scores = cross_val_score(
    multinomial_NB_grid, news_tfidf.toarray(), merged_data["news_type"], cv=10, n_jobs=-1
)
print("Average cross validation score", multinomial_NB_grid_cross_val_scores.mean())
print("STD cross validation score", multinomial_NB_grid_cross_val_scores.std())


Average cross validation score 0.8907206920816085
STD cross validation score 0.04920878104823473


In [34]:
# 4) Mini Project
# Find some text data of your own choice, it could be labelled tweets, etc.
# Your dataset should have at least 200 instances, and if there are several columns of text, you can choose to merge the text columns into a single text column. Each text instance should have at least 60 words.
tweets_data = pd.read_csv("Tweets.csv", skipinitialspace=True)
print(tweets_data.info())
tweets_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [40]:
# 4) Mini Project
# Find some text data of your own choice, it could be labelled tweets, etc.
# Your dataset should have at least 200 instances, and if there are several columns of text, you can choose to merge the text columns into a single text column. Each text instance should have at least 60 words.
tweets_data = pd.read_csv("Tweets.csv", skipinitialspace=True)
display(tweets_data.head())

# Clean the data, split the data, transform the data to a representation suitable for your algorithm, build your model and evaluate the model.
tweets_data = tweets_data[["airline_sentiment", "airline", "negativereason", "text", "tweet_location", "user_timezone"]]
display(tweets_data.info())
tweets_data["review"] = (
    tweets_data["airline"].astype(str)
    + " " 
    + tweets_data["negativereason"].astype(str)
    + " " 
    + tweets_data["text"].astype(str)
    + " "
    + tweets_data["tweet_location"].astype(str)
    + " "
    + tweets_data["user_timezone"].astype(str)
)
tweets_data["review"] = tweets_data["review"].str.replace("nan", "")


def clean_tweet(text):
    try:
        tokens = word_tokenize(text)
        stripped = [str(w).lower().translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
        words = [w for w in words if not w in stop_words]
        return " ".join(words)
    except TypeError:
        return None


tweets_data["review"] = tweets_data["review"].parallel_apply(clean_tweet)
tweets_data.drop(columns=["negativereason", "text","airline", "tweet_location", "user_timezone"], inplace=True)
tweets_data

tweet_tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", decode_error="ignore", max_features=100)
tweet_tfidf_matrix = tweet_tfidf.fit_transform(tweets_data["review"])

X_tweet_train, X_tweet_test, y_tweet_train, y_tweet_test = train_test_split(
    tweet_tfidf_matrix.toarray(), tweets_data["airline_sentiment"], test_size=0.3, random_state=42
)

# Tune some parameters of interest
multinomial_NB_tweet = GridSearchCV(MultinomialNB(), {"alpha": [0.1, 1, 10, 100]}, cv=10, n_jobs=-1)
multinomial_NB_tweet.fit(X_tweet_train, y_tweet_train)

print("Training accuracy", multinomial_NB_tweet.score(X_tweet_train, y_tweet_train))
print("Test accuracy", multinomial_NB_tweet.score(X_tweet_test, y_tweet_test))


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   airline_sentiment  14640 non-null  object
 1   airline            14640 non-null  object
 2   negativereason     9178 non-null   object
 3   text               14640 non-null  object
 4   tweet_location     9907 non-null   object
 5   user_timezone      9820 non-null   object
dtypes: object(6)
memory usage: 686.4+ KB


None

Training accuracy 0.89032006245121
Test accuracy 0.8879781420765027


In [29]:
multinomial_NB_tweet.best_params_

{'alpha': 0.1}