In [2]:
import copy
import csv
import json
import math
import operator
import os
import random

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from sklearn import svm
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, EmotionOptions, SentimentOptions, EntitiesOptions

In [3]:
import nltk
# Download the 'wordnet' NLTK data package; the WordNetLemmatizer imported
# above is instantiated later in this notebook.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ajinkya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Load the labelled training tweets.
df = pd.read_csv('tweets_train.csv', delimiter=',')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459 entries, 0 to 458
Data columns (total 7 columns):
Date               459 non-null object
Tweet Full Text    459 non-null object
Topic              459 non-null object
Action             459 non-null object
user id            356 non-null float64
follower count     356 non-null float64
place              0 non-null float64
dtypes: float64(3), object(4)
memory usage: 25.2+ KB
None


# Topic Modeling

In [13]:
# `topics` holds the distinct labels of the 'Topic' column; the printed
# value counts show how many tweets carry each label.
topic_column = df['Topic']
topics = topic_column.unique()
print(topic_column.value_counts())

Community Action    103
Traffic              95
Appreciation         78
Suggestion           51
Others               24
Follow up            23
Query                22
Dissatisfaction      21
Fraud                17
General Info         13
Theft                 6
Fake News             5
Missing Person        1
Name: Topic, dtype: int64


In [14]:
# Single shared lemmatizer instance used by lemmatize_text().
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Return a list holding the lemma of every item in `text`.

    `text` is iterated item by item (the notebook passes a list of
    whitespace-separated tokens) and each item is handed to the
    module-level `lemmatizer`.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [15]:
# text = "i am Good Good good person".split()
# text = lemmatize_text(text)
# vectorizer = CountVectorizer()
# print(vectorizer.fit_transform(text).toarray().sum(axis=0))
# print(vectorizer.vocabulary_)

# Generating TF-IDF Table

In [16]:
# Collect the vocabulary of the whole corpus: all tweets are joined,
# lower-cased, 'thanks' is folded into 'thank', and every
# whitespace-separated token is lemmatised.
text = df['Tweet Full Text'].str.cat(sep=' ').lower().replace('thanks','thank').split()
text = lemmatize_text(text)
vectorizer = CountVectorizer()
# Only the learned vocabulary is used below, so fit() is enough; the
# document-term matrix that fit_transform() would build was thrown away.
vectorizer.fit(text)
# One row per topic, one column per vocabulary term, initialised to zero
# directly instead of creating an all-NaN object frame and calling fillna(0).
table = pd.DataFrame(0, index=topics, columns=list(vectorizer.vocabulary_.keys()))

In [18]:
# for i in list(vectorizer.vocabulary_.keys()):
#     if(i=='thanks'):
#         print(i)

# Table Entries

In [36]:
# Per-topic term counts are accumulated in a fresh frame that only borrows the
# row/column labels of `table`, so re-running this cell yields the same result
# instead of adding counts on top of an already weighted table.
term_counts = pd.DataFrame(0.0, index=table.index, columns=table.columns)
for x in topics:
    data = df[df['Topic']==x]
    text = data['Tweet Full Text'].str.cat(sep=' ').lower().replace('thanks','thank').split()
    text = lemmatize_text(text)
    vectorizer = CountVectorizer()
    # Column sums of the sparse document-term matrix give the number of
    # occurrences of each term; summing the sparse matrix avoids building
    # the dense (tokens x vocabulary) array.
    count = np.asarray(vectorizer.fit_transform(text).sum(axis=0)).ravel()
    name = list(vectorizer.vocabulary_.keys())
    # vocabulary_ maps each term to its column position in `count`.
    position = [vectorizer.vocabulary_[word] for word in name]
    # A single .loc[row, columns] assignment writes into the frame itself;
    # the chained form table.loc[x][term] += ... may update a temporary copy.
    term_counts.loc[x, name] = count[position]

# Normalise every column by its total, i.e. each term's share per topic.
term_share = term_counts / term_counts.sum()
# Weight each term by log(number of topics / number of topics with a
# non-zero entry for that term); terms present in every topic get weight 0.
topics_with_term = (term_share != 0).sum()
table = term_share * np.log(len(term_share) / topics_with_term)
table.head()

Unnamed: 0,zomatocare,hydcitypolice,cyberabadpolice,vijaygopal_,nzomato,accepted,my,order,and,never,...,jio,hqs,knew,ignored,appropriate,cscpcmxoie,raokavitha,needed,deploy,ibtdrbyxeg
Others,2.564949,0.0,0.012076,0.733169,2.564949,2.564949,0.017585,0.293267,0.0,0.293267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Follow up,0.0,0.0,0.004025,0.366584,0.0,0.0,0.017585,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Traffic,0.0,0.0,0.020127,0.366584,0.0,0.0,0.004396,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Appreciation,0.0,0.0,0.038241,0.0,0.0,0.0,0.017585,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fraud,0.0,0.0,0.002013,0.0,0.0,0.0,0.013188,0.0,0.0,0.0,...,2.564949,2.564949,2.564949,2.564949,2.564949,2.564949,0.0,0.0,0.0,0.0


In [20]:
# Write the weighted topic/term table to table.csv in the working directory.
table.to_csv('table.csv', sep=',', encoding='utf-8')

In [22]:
# Marker message printed once the table has been built and written out.
print("Training Completed")

Training Completed


# Predicting Topics

In [23]:
def predict(text):
    """Return the topic label that best matches `text`.

    The text is lower-cased, 'thanks' is folded into 'thank', quotes and
    '!' are removed, '.' becomes a space, and the result is split on
    whitespace and lemmatised. Every token that is a column of the global
    `table` votes for the topic (row) holding that column's largest weight,
    adding that weight to the topic's score. The topic with the highest
    accumulated score is returned.

    Returns 'Others' when `text` is not a string or when none of its tokens
    is a column of `table`.
    """
    if not isinstance(text, str):
        # e.g. NaN for a missing tweet: there is nothing to score.
        return 'Others'
    tokens = text.lower().replace('thanks','thank').replace("'",'').replace('"','').replace('!','').replace('.',' ').split()
    tokens = lemmatize_text(tokens)
    topic_scores = {}
    for word in tokens:
        # Unknown words are skipped explicitly instead of relying on a bare
        # except to swallow the KeyError (and every other error with it).
        if word not in table.columns:
            continue
        weights = table[word]
        best_weight = weights.max()
        if pd.isna(best_weight):
            # Column without any usable weight: it cannot vote for a topic.
            continue
        best_topic = weights.idxmax()
        topic_scores[best_topic] = topic_scores.get(best_topic, 0) + best_weight
    if not topic_scores:
        return 'Others'
    return max(topic_scores, key=topic_scores.get)

In [35]:
# Try predict() on a single tweet given in the same b'...' text form as the
# 'Tweet Full Text' column.
predict("b'@TelanganaDGP @cpnizamabad @CPHydCity @hydcitypolice @sp_kamareddy @spkamareddy Thank you so much sir For Your Response \xf0\x9f\x99\x8f @TelanganaDGP'")

'Appreciation'

# Evaluating Predictions on the Training Data

In [32]:
# Label every tweet in df with predict() and report the fraction of rows whose
# prediction equals the annotated 'Topic'. These are the same tweets the
# weight table was built from, so the figure is a training-set accuracy.
df['predictions'] = df['Tweet Full Text'].map(predict)
df["classification_correct"] = df["predictions"].eq(df["Topic"])
df['classification_correct'].mean()

0.9281045751633987

# Sentiment, Emotion, Entity

In [26]:
# Watson Natural Language Understanding client.
# The API key is read from the WATSON_NLU_APIKEY environment variable rather
# than being stored in the notebook; set it before starting the kernel
# (a missing variable raises KeyError here). A key that was ever saved in the
# notebook should be revoked and replaced.
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2018-11-16',
    iam_apikey=os.environ['WATSON_NLU_APIKEY'],
    url='https://gateway-lon.watsonplatform.net/natural-language-understanding/api'
)

In [28]:
# response = natural_language_understanding.analyze(
#     text="b'@HYDTP @hydcitypolice @AddlCPTrHyd Good initiative'",
#     features=Features(sentiment=SentimentOptions(),emotion=EmotionOptions(),entities=EntitiesOptions())).get_result()
# print(json.loads(json.dumps(response['emotion']['document']['emotion'])))
# print(json.loads(json.dumps(response['sentiment']['document']['score'])))
# json.loads(json.dumps(response['entities'][0]['type']))=='Location'

In [30]:
# Ask Watson NLU for the document-level sentiment score of every tweet and
# store it in df['score'].
score = []
for tweet in df['Tweet Full Text']:
    response2 = natural_language_understanding.analyze(
        text=tweet,
        features=Features(sentiment=SentimentOptions()),
        language='en'
    ).get_result()
    # Fall back to 0 when the response carries no document score. The
    # previous try/except wrapped only a list append that could not fail,
    # so the intended fallback never applied. Errors raised by the API call
    # itself still propagate, as before.
    document = response2.get('sentiment', {}).get('document', {})
    score.append(document.get('score', 0))
df['score'] = score
df.head()

Unnamed: 0,Date,Tweet Full Text,Topic,Action,user id,follower count,place,score
0,2/25/19 6:34,b'@zomatocare @hydcitypolice \n@cyberabadpolic...,Others,NS,1.08e+18,0.0,,0.339199
1,2/25/19 5:15,"b'@hydcitypolice Sir, any updates regarding my...",Follow up,S,1.04e+18,0.0,,-0.761524
2,2/24/19 5:59,b'@HYDTP @CPHydCity @hydcitypolice @TelanganaD...,Traffic,NS,1.09e+18,0.0,,0.0
3,03-01-2019 14:32,b'@HYDTP @hydcitypolice @AddlCPTrHyd Good init...,Appreciation,NS,140010800.0,0.0,,0.901576
4,03-01-2019 12:34,b'@USCGHyderabad @TelanganaDGP @USAndHyderabad...,Appreciation,NS,1.05e+18,0.0,,0.965338


In [31]:
# Ask Watson NLU for the document-level emotion values of every tweet and add
# one column per emotion to df, in the order listed below.
emotion_columns = {'sadness': [], 'joy': [], 'fear': [], 'disgust': [], 'anger': []}
for tweet in df['Tweet Full Text']:
    analysis = natural_language_understanding.analyze(
        text=tweet,
        features=Features(emotion=EmotionOptions()),
        language='en'
    ).get_result()
    document_emotion = analysis['emotion']['document']['emotion']
    # Append this tweet's value to each emotion's list.
    for emotion, values in emotion_columns.items():
        values.append(document_emotion[emotion])
for emotion, values in emotion_columns.items():
    df[emotion] = values
df.head()

Unnamed: 0,Date,Tweet Full Text,Topic,Action,user id,follower count,place,score,sadness,joy,fear,disgust,anger
0,2/25/19 6:34,b'@zomatocare @hydcitypolice \n@cyberabadpolic...,Others,NS,1.08e+18,0.0,,0.339199,0.105245,0.225655,0.001491,0.002787,0.0233
1,2/25/19 5:15,"b'@hydcitypolice Sir, any updates regarding my...",Follow up,S,1.04e+18,0.0,,-0.761524,0.162827,0.033009,0.013693,0.125967,0.187487
2,2/24/19 5:59,b'@HYDTP @CPHydCity @hydcitypolice @TelanganaD...,Traffic,NS,1.09e+18,0.0,,0.0,0.14188,0.146321,0.088611,0.105356,0.080799
3,03-01-2019 14:32,b'@HYDTP @hydcitypolice @AddlCPTrHyd Good init...,Appreciation,NS,140010800.0,0.0,,0.901576,0.054764,0.780943,0.050431,0.014236,0.026168
4,03-01-2019 12:34,b'@USCGHyderabad @TelanganaDGP @USAndHyderabad...,Appreciation,NS,1.05e+18,0.0,,0.965338,0.002762,0.835799,0.001891,0.001063,0.012618


In [33]:
# Flag, for every tweet, whether the first entity returned by Watson NLU has
# type 'Location', and store the flags in df['location'].
location = []
for tweet in df['Tweet Full Text']:
    response2 = natural_language_understanding.analyze(
        text=tweet,
        features=Features(entities=EntitiesOptions()),
        language='en'
    ).get_result()
    entities = response2.get('entities') or []
    # Always append a real boolean. The previous fallback stored the string
    # 'False', which is truthy and mixed str with bool in one column.
    # NOTE(review): only the first entity is inspected, as before - confirm
    # that a Location appearing later in the list should not count.
    location.append(bool(entities) and entities[0].get('type') == 'Location')
df['location'] = location
df.head()

Unnamed: 0,Date,Tweet Full Text,Topic,Action,user id,follower count,place,score,sadness,joy,fear,disgust,anger,predictions,classification_correct,location
0,2/25/19 6:34,b'@zomatocare @hydcitypolice \n@cyberabadpolic...,Others,NS,1.08e+18,0.0,,0.339199,0.105245,0.225655,0.001491,0.002787,0.0233,Others,True,False
1,2/25/19 5:15,"b'@hydcitypolice Sir, any updates regarding my...",Follow up,S,1.04e+18,0.0,,-0.761524,0.162827,0.033009,0.013693,0.125967,0.187487,Follow up,True,False
2,2/24/19 5:59,b'@HYDTP @CPHydCity @hydcitypolice @TelanganaD...,Traffic,NS,1.09e+18,0.0,,0.0,0.14188,0.146321,0.088611,0.105356,0.080799,Traffic,True,False
3,03-01-2019 14:32,b'@HYDTP @hydcitypolice @AddlCPTrHyd Good init...,Appreciation,NS,140010800.0,0.0,,0.901576,0.054764,0.780943,0.050431,0.014236,0.026168,Appreciation,True,False
4,03-01-2019 12:34,b'@USCGHyderabad @TelanganaDGP @USAndHyderabad...,Appreciation,NS,1.05e+18,0.0,,0.965338,0.002762,0.835799,0.001891,0.001063,0.012618,Appreciation,True,False


# Saving the Training Data and the TF-IDF Table

In [34]:
# Write the enriched tweet frame and the topic/term weight table to CSV files
# in the working directory.
for frame, path in ((df, 'train_df.csv'), (table, 'table.csv')):
    frame.to_csv(path, sep=',', encoding='utf-8')