In [1]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [2]:
#Load data
import pandas as pd
import numpy as np
import re
import nltk
import os

# Location of the training-set CSV. The default is the path this notebook was
# written against; set the TRAINSET_CSV environment variable to point at the
# file on another machine without editing this cell.
TRAINSET_CSV = os.environ.get(
    'TRAINSET_CSV', 'C:\\Ashish\\Project\\dataset\\trainset1.csv')
df = pd.read_csv(TRAINSET_CSV)  # add nrows=1500 for a quick partial load
# Preview the first five raw tweets.
df.text[:5]

0    #weather Another #fakenews video @CNN Anderson...
1    Staring Down Hurricane Florence via NASA https...
2    #hurricane #florence http:// tra.one/tcphurFlo...
3    Hurricane Florence is still moving through the...
4    Houston is collecting donations for Hurricane ...
Name: text, dtype: object

In [3]:
# Number of tweets per category label, most frequent first.
df['category'].value_counts()

1    1009
2     588
3     307
4     118
Name: category, dtype: int64

In [4]:
# Remove Twitter handles
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern``
        replaced by the empty string.

    Notes
    -----
    The substitution is done with ``pattern`` itself rather than by
    re-using each matched substring as a new regex.  Re-using the matched
    text is unsafe: a short match such as ``@user`` would also clip the
    front of ``@username``, and a match containing regex metacharacters
    (``+``, ``(``, ``?`` ...) would be misinterpreted or raise ``re.error``.
    """
    return re.sub(pattern, '', input_txt)

# Remove twitter handles (@user) from every tweet.
# The pattern is a raw string so that "\w" reaches the regex engine unchanged
# instead of being parsed as an (invalid) string escape sequence.
df['text'] = np.vectorize(remove_pattern)(df['text'], r"@[\w]*")

In [5]:
#Remove hyperlinks and hashtags
# One alternation covering, in order: tokens containing a slash (plus the
# token that follows), tokens ending in "html", hyphenated tokens, fractions,
# digit runs, tokens containing '%', ':' or '=', and hashtags.
# NOTE(review): the last alternative `.#\S+` needs one character before the
# '#', so a hashtag at the very start of a tweet is not matched - confirm
# that this is intended.
_NOISE_RE = re.compile(
    r"\S+\/.\S+ *\S+|.\S+html|\S+-\S+|\d*\/\d+|\d+|\S+%\S+|\S+:\S*|\S+=\S+|.#\S+")


def _scrub_tweet(raw_tweet):
    """Lower-case a tweet, delete every _NOISE_RE match, then drop non-ASCII characters."""
    lowered = raw_tweet.lower()
    without_noise = _NOISE_RE.sub("", lowered)
    # encode/decode round trip silently discards anything outside ASCII
    return without_noise.encode("ascii", errors="ignore").decode()


df['text'] = df['text'].map(_scrub_tweet)

In [6]:
#Remove Stop Words
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
# English stop words held in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(tweet):
    """Tokenize ``tweet`` and re-join, space-separated, the tokens not in ``stop_words``."""
    # Filter once; building the filtered list twice and discarding the first
    # result only doubled the work.
    return ' '.join(token for token in word_tokenize(tweet)
                    if token not in stop_words)


df['text'] = df['text'].map(_drop_stop_words)

In [7]:
#Remove Custom Words
# Extra tokens to discard on top of the NLTK stop-word list.
custom_words = ['via','rt','fav','…','am','et','pm','n\'t','y\'all']


def _drop_custom_words(tweet):
    """Split ``tweet`` on whitespace and re-join the words whose lower-case form is not in ``custom_words``."""
    kept = (token for token in tweet.split()
            if token.lower() not in custom_words)
    return ' '.join(kept)


df['text'] = df['text'].map(_drop_custom_words)

In [8]:
#Remove punctuation
import string
# Characters trimmed from both ends of every word: ASCII punctuation plus a
# few typographic quotes, dashes, bullets and symbols.
remove = string.punctuation + ".‘’\''“”°…-—––•・®.:#"


def _trim_punctuation(tweet):
    """Strip the ``remove`` characters from the ends of each word, re-join, and trim outer whitespace."""
    # str.strip only touches the ends of a word; characters in the middle stay.
    trimmed_words = [word.strip(remove) for word in tweet.split()]
    return ' '.join(trimmed_words).strip()


df['text'] = df['text'].map(_trim_punctuation)

In [9]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the constant: J -> adjective, N -> noun,
    V -> verb, R -> adverb.  Any other tag falls back to noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun.
    return wordnet.NOUN

# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()


def _lemmatize_tweet(tweet):
    """Tokenize ``tweet`` and lemmatize each token with the POS chosen by ``get_wordnet_pos``."""
    lemmas = []
    for token in nltk.word_tokenize(tweet):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return ' '.join(lemmas)


# 2. Replace every tweet with its space-joined lemmas
df['text'] = df['text'].map(_lemmatize_tweet)

In [10]:
# Remove words with 3 or fewer characters (only words longer than 3 are kept).
# NOTE(review): the cutoff applied below is len(word) > 3 - confirm that this
# is the intended threshold.
import nltk
# NOTE(review): `words` is not referenced by the filter below; verify whether
# another cell still needs it before removing this corpus load.
words = set(nltk.corpus.words.words())


def _keep_long_words(tweet):
    """Return ``tweet`` reduced to its whitespace-separated words of more than three characters."""
    return ' '.join(filter(lambda word: len(word) > 3, tweet.split()))


df['text'] = df['text'].map(_keep_long_words)

In [11]:
# Remove tweets with fewer than 2 words.
# NOTE(review): the threshold applied is `count < 2` - confirm that this is
# the intended minimum length.
count = df['text'].str.split().str.len()
# .copy() makes the filtered frame independent of the original, so adding a
# column to it in a later cell does not raise SettingWithCopyWarning.
df = df[~(count < 2)].copy()

In [12]:
#To view tweets
# for index, row in df.iterrows():
#     print(row.text)

In [13]:
#----------END--OF--PRE-PROCESSING----------#

In [14]:
#

In [15]:
#----------END--OF--CLASSIFICATION----------#

In [16]:
#Init API Keys
import os
import plotly

# SECURITY: the API key is a secret and must not live in the notebook source.
# It is read from the PLOTLY_API_KEY environment variable instead; any key
# that was previously committed in this cell should be regenerated.
# The username is not secret, so it keeps its value as an overridable default.
_plotly_username = os.environ.get('PLOTLY_USERNAME', 'ashishmalpani706')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if not _plotly_api_key:
    raise RuntimeError(
        "Set the PLOTLY_API_KEY environment variable before running this cell.")

plotly.tools.set_credentials_file(username=_plotly_username,
                                  api_key=_plotly_api_key)

In [17]:
#Extract Day and save it to main data
from datetime import datetime
# Day of month for every tweet, parsed from its 'dd-mm-yy HH:MM' date string.
day = [datetime.strptime(stamp, '%d-%m-%y %H:%M').day for stamp in df['date']]
df['day'] = day

In [18]:
#Generate data for traces of plot
# Number of non-null 'date' entries for every (day, category) pair.
grouped_data = df.groupby(["day", "category"])['date'].count()

# The first index level holds the days; they become the x values of the plot.
x_axis = grouped_data.index.levels[0]

# For each day on the x axis, the counts of that day indexed by category.
data = [grouped_data[point] for point in x_axis]

number_of_categories = len(df['category'].unique())

# t[k][i] is the count for category label k + 1 on the i-th day, or 0 when
# that label has no tweets on that day.
# NOTE(review): this relies on the category labels being exactly
# 1..number_of_categories - confirm against the dataset.
t = [
    [per_day[label] if label in per_day else 0 for per_day in data]
    for label in range(1, number_of_categories + 1)
]

In [19]:
#Generate traces
import plotly.plotly as py
import plotly.graph_objs as go
import turtle
import random
# Make a list of colors to pick from. Add more if more categories are present
colors  = ["blue","green","red","purple","yellow","pink","orange"]


def _stacked_trace(trace_number):
    """Build the stacked-area trace spec for the category at position ``trace_number``."""
    return dict(
        x = x_axis,
        y = t[trace_number],
        hoverinfo='x+y',
        mode='lines',
        line=dict(width=0.5,
                  # Wrap around the palette so that having more categories
                  # than colors re-uses colors instead of raising IndexError.
                  color = colors[trace_number % len(colors)]),
        stackgroup='one')


trace = [_stacked_trace(trace_number)
         for trace_number in range(number_of_categories)]
plotly_data = [go.Scatter(single_trace) for single_trace in trace]
# Upload the chart without opening a browser tab; the returned URL is what
# the widget cell loads.
url = py.plot({'data': plotly_data},
               filename = 'widget chart', auto_open=False)
print(url)

https://plot.ly/~ashishmalpani706/43


In [20]:
#Generate table for plot
# Header row: column titles and styling.
table_header = dict(values=['Date', 'Category', 'Text'],
                    fill = dict(color='#C2D4FF'),
                    align = ['left'] * 5)
# Body: one column each for the date, category and text of every tweet.
table_cells = dict(values=[df.date, df.category, df.text],
                   fill = dict(color='#F5F8FF'),
                   align = ['left'] * 5)
trace = go.Table(header=table_header, cells=table_cells)

# Upload the table without opening a browser tab and keep its URL.
url2 = py.plot({'data': [go.Table(trace)]},
               filename = 'widget table', auto_open=False)
print(url2)

https://plot.ly/~ashishmalpani706/45


In [24]:
#Display plot and table
from plotly.widgets import GraphWidget
# Wrap the uploaded chart (url) and table (url2) in notebook widgets and show both.
g = GraphWidget(url)
g2 = GraphWidget(url2)
display(g, g2)

GraphWidget()

GraphWidget()

In [25]:
import math
def message_handler(widget, msg):
    """Refresh the table widget ``g2`` with the tweets inside a zoomed day range.

    Parameters
    ----------
    widget
        The widget that delivered the message (not used).
    msg : mapping
        Must contain key ``'x'`` whose first two items are the lower and
        upper x-axis bounds; both are floored to whole day numbers.

    Side effects
    ------------
    Calls ``g2.restyle`` with new cell values: the ``date``, ``category``
    and ``text`` columns of every row of ``df`` whose ``day`` lies in
    ``[start, end]`` (both ends inclusive), in that column order.
    """
    start = math.floor(msg['x'][0])
    end = math.floor(msg['x'][1])

    # A single boolean mask selects the rows once, instead of scanning the
    # whole frame row by row separately for each of the three columns.
    in_range = df['day'].between(start, end)
    selected = df.loc[in_range, ['date', 'category', 'text']]

    # One Series per table column, re-indexed from 0.
    analysis_data = dict(values=[
        selected[column].reset_index(drop=True)
        for column in ('date', 'category', 'text')
    ])
    g2.restyle({'cells': analysis_data})

# Register message_handler as the zoom callback of the chart widget g.
g.on_zoom(message_handler)