In [247]:
import sys
import csv
import math
import random
import requests
import pandas as pd
import numpy as np
import sklearn as sk
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import sklearn.model_selection as model_select
import sklearn.tree as tree
import sklearn.metrics as metrics
import sklearn.naive_bayes as nb
from sklearn.feature_extraction.text import CountVectorizer
# Part 3: Mining text data.

# Return a pandas dataframe containing the data set.
# Specify a 'latin-1' encoding when reading the data.
# data_file will be populated with the string 'wholesale_customers.csv'.
def read_csv_3(data_file):
    DATA_DIR  = './data/'
    DATA_FILE = data_file
    try:
        rawdata = pd.read_csv(DATA_DIR + DATA_FILE, encoding = 'latin-1')
    except IOError as iox:
        print('there was an I/O error trying to open the data file: ' + str( iox ))
        sys.exit()
    return rawdata

# Return a list with the possible sentiments that a tweet might have.
def get_sentiments(df):
    sentis = df["Sentiment"].unique()
    sentis = list(sentis)
    return sentis

# Return a string containing the second most popular sentiment among the tweets.
def second_most_popular_sentiment(df):
    return  df["Sentiment"].value_counts().index[1]

# Return the date (string as it appears in the data) with the greatest number of extremely positive tweets.
def date_most_popular_tweets(df):
    res = df.loc[df["Sentiment"] == 'Extremely Positive']
    date = list(res["TweetAt"])
    return date[len(date)-1]

# Modify the dataframe df by converting all tweets to lower case. 
def lower_case(df):
    df['OriginalTweet'] = df['OriginalTweet'].str.lower()
    return df

# Modify the dataframe df by replacing each characters which is not alphabetic or whitespace with a whitespace.
def remove_non_alphabetic_chars(df):
    df['OriginalTweet'] = df['OriginalTweet'].str.replace('[^a-zA-Z]', ' ', regex=True)


# Modify the dataframe df with tweets after removing characters which are not alphabetic or whitespaces.
def remove_multiple_consecutive_whitespaces(df):
    df['OriginalTweet'] = df['OriginalTweet'].str.replace('\s+', ' ', regex=True)
    
# Given a dataframe where each tweet is one string with words separated by single whitespaces,
# tokenize every tweet by converting it into a list of words (strings).
def tokenize(df):
    df['OriginalTweet'] = df['OriginalTweet'].apply(lambda row: [word for word in row.split()])
    
    
# Given dataframe tdf with the tweets tokenized, return the number of words in all tweets including repetitions.
def count_words_with_repetitions(tdf):
    a = list(df["OriginalTweet"])
    a = [col for row in a for col in row ]
    count = len(a)
    return count

# Given dataframe tdf with the tweets tokenized, return the number of distinct words in all tweets.
def count_words_without_repetitions(tdf):
    a = list(df["OriginalTweet"])
    a = [col for row in a for col in row ]
    a = set(a)
    count = len(a)
    return count

# Given dataframe tdf with the tweets tokenized, return a list with the k distinct words that are most frequent in the tweets.
def frequent_words(tdf,k):
    a = list(df["OriginalTweet"])
    a = [col for row in a for col in row ]
    freq = list(pd.value_counts(a).head(k).index)
    return freq

# Given dataframe tdf with the tweets tokenized, remove stop words and words with <=2 characters from each tweet.
# The function should download the list of stop words via:
# https://raw.githubusercontent.com/fozziethebeat/S-Space/master/data/english-stop-words-large.txt
def remove_stop_words(tdf):
    #get the stopwords
    stopwords = requests.get("https://raw.githubusercontent.com/fozziethebeat/S-Space/master/data/english-stop-words-large.txt" ).content.decode("UTF-8").split("\n")
    df['OriginalTweet'] = df['OriginalTweet'].apply(lambda x: [word for word in x if (len(word)>2 and word not in stopword)])
    
# Given dataframe tdf with the tweets tokenized, reduce each word in every tweet to its stem.
def stemming(tdf):
    stemmer = PorterStemmer()
    df['OriginalTweet'] = df['OriginalTweet'].apply(lambda row: [stemmer.stem(word) for word in row])

# Given a pandas dataframe df with the original coronavirus_tweets.csv data set,
# build a Multinomial Naive Bayes classifier. 
# Return predicted sentiments (e.g. 'Neutral', 'Positive') for the training set
# as a 1d array (numpy.ndarray). 
def mnb_predict(df):
    lower_case(df)
    remove_non_alphabetic_chars(df)
    remove_multiple_consecutive_whitespaces(df)
    tokenize(df)
    remove_stop_words(df)
    stemming(df)
    
    a = list(df["OriginalTweet"])
    words = [col for row in a for col in row ]
    
    cv = CountVectorizer(min_df=5)
    cv_fit = cv.fit_transform(words)
    # cv.vocabulary_
    terms = list(cv.vocabulary_)
    termdoc = [[0 for j in range(len(terms))]  for i in range(len(df))]
    for i in range(len(df)):
        for j in range(len(terms)):
            if(terms[j] in  df["OriginalTweet"][i]):
                termdoc[i][j] = cv.vocabulary_[terms[j]] 
    label = list(df["Sentiment"])
    clf = nb.MultinomialNB()
    clf.fit(termdoc,label)
    
    y_hat = clf.predict(termdoc)
    return y_hat
# Given a 1d array (numpy.ndarray) y_pred with predicted labels (e.g. 'Neutral', 'Positive') 
# by a classifier and another 1d array y_true with the true labels, 
# return the classification accuracy rounded in the 3rd decimal digit.
def mnb_accuracy(y_pred,y_true):
    count = 0
    for i in range(len(y_pred)):
        if (y_true[i] == y_pred[i]):
            count += 1
    acc = count / len(y_pred)
    acc = round(acc,3)
    return acc

In [248]:
df = read_csv_3("coronavirus_tweets.csv")
y_true = np.array(df["Sentiment"])
y_pred = mnb_predict(df)
mnb_accuracy(y_pred,y_true)
#3 1500 0.47
#5 1500 0.474
#10 1500 0.473
#5 3000 0.51
#5 2000 0.487
#5 max 0.58

0.58

In [118]:
df = read_csv_3("coronavirus_tweets.csv")
sen = get_sentiments(df)

sec = second_most_popular_sentiment(df)


data_most = date_most_popular_tweets(df)


# lower_case(df)
# # print(df)
# remove_non_alphabetic_chars(df)
# remove_multiple_consecutive_whitespaces(df)

# tokenize(df)
mnb_predict(df)


In [161]:
df
# len(df)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,"[menyrbi, phil, gahan, chrisitv, http, ifz, fa...",Neutral
1,3800,48752,UK,16-03-2020,"[advic, talk, neighbour, famili, exchang, phon...",Positive
2,3801,48753,Vagabonds,16-03-2020,"[coronaviru, australia, woolworth, give, elder...",Positive
3,3802,48754,,16-03-2020,"[food, stock, empti, don, panic, food, stay, c...",Positive
4,3803,48755,,16-03-2020,"[readi, supermarket, covid, outbreak, paranoid...",Extremely Negative
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,"[airlin, pilot, offer, stock, supermarket, she...",Neutral
41153,44952,89904,,14-04-2020,"[respons, complaint, provid, cite, covid, rela...",Extremely Negative
41154,44953,89905,,14-04-2020,"[tough, kameronwild, ration, toilet, paper, co...",Positive
41155,44954,89906,,14-04-2020,"[wrong, smell, hand, sanit, start, turn, coron...",Neutral


In [208]:
a = list(df["OriginalTweet"])
words = [col for row in a for col in row ]

cv = CountVectorizer(min_df=5,max_features=500)
cv_fit = cv.fit_transform(words)
# cv.vocabulary_
termdoc = [[0 for j in range(len(cv.vocabulary_))]  for i in range(len(df))]
terms = list(cv.vocabulary_)

# list(cv_fit)
# cv.toarray()

In [209]:
for i in range(len(df)):
    for j in range(len(terms)):
        if(terms[j] in  df["OriginalTweet"][i]):
            termdoc[i][j] = cv.vocabulary_[terms[j]] 
len(termdoc)



41157

In [210]:
label = list(df["Sentiment"])
clf = nb.MultinomialNB()
clf.fit(termdoc,label)


MultinomialNB()

In [219]:
y_hat = clf.predict(termdoc)
type(label)
count = 0
for i in range(len(label)):
    if (label[i] == y_hat[i]):
        count += 1
count / len(df)

0.4057875938479481

In [164]:
text = df['OriginalTweet'][0]+df["OriginalTweet"][1]

print(text)
text = np.array(text)
print(text)
print(type(text))
cv = CountVectorizer(min_df= 2,max_features=3)
cv_fit = cv.fit_transform(text)
print("dic is: "+str(cv.vocabulary_))
print("dic is: "+ str(len(cv.vocabulary_)))
print(cv_fit.toarray())
print(cv_fit)

['menyrbi', 'phil', 'gahan', 'chrisitv', 'http', 'ifz', 'fan', 'http', 'ghgfzcc', 'http', 'nlzdxno', 'advic', 'talk', 'neighbour', 'famili', 'exchang', 'phone', 'number', 'creat', 'contact', 'list', 'phone', 'number', 'neighbour', 'school', 'employ', 'chemist', 'set', 'onlin', 'shop', 'account', 'poss', 'adequ', 'suppli', 'regular', 'med', 'order']
['menyrbi' 'phil' 'gahan' 'chrisitv' 'http' 'ifz' 'fan' 'http' 'ghgfzcc'
 'http' 'nlzdxno' 'advic' 'talk' 'neighbour' 'famili' 'exchang' 'phone'
 'number' 'creat' 'contact' 'list' 'phone' 'number' 'neighbour' 'school'
 'employ' 'chemist' 'set' 'onlin' 'shop' 'account' 'poss' 'adequ' 'suppli'
 'regular' 'med' 'order']
<class 'numpy.ndarray'>
dic is: {'http': 0, 'neighbour': 1, 'number': 2}


TypeError: object of type 'NoneType' has no len()

In [218]:
a = ['a','b','c']
b = ['a',2,3]
b = np.array(b)
for i in range(3):
    if(a[i] == b[i]):
        print(a[i])

a


In [223]:
a = 3.234487293
round(a,3)

3.234