In [146]:
import csv
import math
import os
import random
import sys

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.metrics as metrics
import sklearn.model_selection as model_select
import sklearn.tree as tree
# Part 3: Mining text data.

# Return a pandas dataframe containing the data set.
# Specify a 'latin-1' encoding when reading the data.
# data_file is the name of the CSV file to load, e.g. 'coronavirus_tweets.csv'.
def read_csv_3(data_file, data_dir='./data/'):
    """Load a CSV data set into a pandas DataFrame.

    Parameters
    ----------
    data_file : str
        File name of the CSV, resolved relative to ``data_dir``.
    data_dir : str, optional
        Directory that holds the data file. Defaults to ``'./data/'``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, decoded as latin-1.

    Raises
    ------
    SystemExit
        With exit status 1 if an I/O error occurs while reading the file.
    """
    data_path = os.path.join(data_dir, data_file)
    try:
        return pd.read_csv(data_path, encoding='latin-1')
    except IOError as iox:
        print('there was an I/O error trying to open the data file: ' + str(iox))
        # Exit with a non-zero status so the failure is not reported as success.
        sys.exit(1)

# Return a list with the possible sentiments that a tweet might have.
def get_sentiments(df):
    """List the distinct values found in the 'Sentiment' column of df.

    Values are listed in the order ``Series.unique`` yields them.
    """
    return list(df["Sentiment"].unique())

# Return a string containing the second most popular sentiment among the tweets.
def second_most_popular_sentiment(df):
    """Return the 'Sentiment' label that ranks second by number of tweets."""
    # value_counts orders labels from most to least frequent, so position 1
    # in its index is the runner-up.
    ranking = df["Sentiment"].value_counts()
    runner_up = ranking.index[1]
    return runner_up

# Return the date (string as it appears in the data) with the greatest number of extremely positive tweets.
def date_most_popular_tweets(df):
    """Find the date on which the most 'Extremely Positive' tweets were posted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Sentiment' and 'TweetAt'.

    Returns
    -------
    The 'TweetAt' value (unchanged, as stored in the data) that occurs most
    often among rows whose 'Sentiment' is 'Extremely Positive', or ``None``
    when no row has that sentiment. If several dates share the maximum
    count, one of them is returned.
    """
    is_extremely_positive = df["Sentiment"] == 'Extremely Positive'
    # Count the matching tweets per date instead of just picking a row.
    tweets_per_date = df.loc[is_extremely_positive, "TweetAt"].value_counts()
    if tweets_per_date.empty:
        return None
    return tweets_per_date.idxmax()

# Modify the dataframe df by converting all tweets to lower case.
def lower_case(df):
    """Lower-case every entry of df['OriginalTweet'] in place and return df."""
    lowered = df['OriginalTweet'].str.lower()
    df['OriginalTweet'] = lowered
    return df

# Modify the dataframe df by replacing each character which is not alphabetic with a single space.
def remove_non_alphabetic_chars(df):
    """Blank out every non-letter character of df['OriginalTweet'] in place.

    Each character outside a-z / A-Z is replaced by one space, so the length
    of every tweet is unchanged. Whitespace characters other than the space
    (tabs, newlines) are also turned into spaces. Nothing is returned.
    """
    tweets = df['OriginalTweet']
    df['OriginalTweet'] = tweets.str.replace(pat='[^a-zA-Z]', repl=' ', regex=True)


# Modify the dataframe df by replacing every run of consecutive whitespace characters with a single space.
def remove_multiple_consecutive_whitespaces(df):
    """Collapse each run of whitespace in df['OriginalTweet'] to one space.

    The column is overwritten in place and nothing is returned. Leading and
    trailing whitespace is reduced to a single space, not stripped.
    """
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    df['OriginalTweet'] = df['OriginalTweet'].str.replace(r'\s+', ' ', regex=True)
    
# Given a dataframe where each tweet is one string with words separated by single whitespaces,
# tokenize every tweet by converting it into a list of words (strings).
def tokenize(df):
    """Replace each tweet in df['OriginalTweet'] with its list of tokens.

    Every entry of the column is passed to ``nltk.word_tokenize`` and the
    resulting list is stored back in place. Nothing is returned.
    """
    # Apply over the column itself rather than row by row across the frame:
    # only 'OriginalTweet' is needed, so no per-row Series has to be built.
    df['OriginalTweet'] = df['OriginalTweet'].apply(nltk.word_tokenize)

In [147]:
# Load the tweets data set; read_csv_3 resolves the name inside its data directory.
df = read_csv_3("coronavirus_tweets.csv")
# Distinct labels found in the 'Sentiment' column.
sen = get_sentiments(df)

# Label at position 1 of the sentiment frequency ranking.
sec = second_most_popular_sentiment(df)


# Result of date_most_popular_tweets for the loaded data.
data_most = date_most_popular_tweets(df)


# Text-cleaning pipeline. Each step overwrites df['OriginalTweet'] in place,
# so the order matters: lower-case -> blank out non-letters -> collapse
# whitespace -> tokenize.
lower_case(df)
# print(df)
remove_non_alphabetic_chars(df)
remove_multiple_consecutive_whitespaces(df)

tokenize(df)
# Display the frame; 'OriginalTweet' now holds lists of tokens.
df


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,"[menyrbie, phil, gahan, chrisitv, https, t, co...",Neutral
1,3800,48752,UK,16-03-2020,"[advice, talk, to, your, neighbours, family, t...",Positive
2,3801,48753,Vagabonds,16-03-2020,"[coronavirus, australia, woolworths, to, give,...",Positive
3,3802,48754,,16-03-2020,"[my, food, stock, is, not, the, only, one, whi...",Positive
4,3803,48755,,16-03-2020,"[me, ready, to, go, at, supermarket, during, t...",Extremely Negative
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,"[airline, pilots, offering, to, stock, superma...",Neutral
41153,44952,89904,,14-04-2020,"[response, to, complaint, not, provided, citin...",Extremely Negative
41154,44953,89905,,14-04-2020,"[you, know, it, s, getting, tough, when, kamer...",Positive
41155,44954,89906,,14-04-2020,"[is, it, wrong, that, the, smell, of, hand, sa...",Neutral
