In [1]:
import json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Task 1
# Training Data
train_data_jsonl = open('train.data.jsonl', 'r')
train_label = open('train.label.json', 'r')

# Dev Data
dev_data_jsonl = open('dev.data.jsonl', 'r')
dev_label = open('dev.label.json', 'r')

# Test Data
test_data_jsonl = open('test.data.jsonl', 'r')


# Task 2 Covid Data
covid_data_jsonl = open('covid.data.jsonl', 'r')

In [3]:
# Tweet ID (Key)
# USer ID
# Follower Count
# Text
# Time Posted
# Parent Tweets ID (If Any)
# Child tweets ID (If any)


# Find BERT tokenize max size
bert_token_list = []

tweet_dict = {}

train_label_json = json.load(train_label)
dev_label_json = json.load(dev_label)

# Make label 1 or 0
# 0 for non-rumour
def binary_label(label):
    if label == 'non-rumour':
        return 0
    else:
        return 1

def jsonl_to_list(jsonl, labels):
    out_list = []
    responses_list = []
    reply_list =[]
    
    for line in jsonl:
        data = json.loads(line)
        responses = 0

        for tweet_data in data:
            
            tweet_id = tweet_data['id']
            user_id = tweet_data['user']['id']
            follower_count = tweet_data['user']['followers_count']
            
            
            text = tweet_data['text']            
            time = tweet_data['created_at']
            parent = tweet_data['in_reply_to_status_id']


            tweet_dict[tweet_id] = {'user_id' : user_id,
                                   'follower_count' : follower_count,
                                   'text' : text,
                                   'time': time,
                                   'parent': parent}
            
            
            # get label and conver to 0 or 1
            
            try:
                label = binary_label(labels[str(tweet_id)])
                out_list.append([tweet_id, text, time, parent, label, follower_count, user_id])
            except KeyError:
                responses += 1
                reply_list.append([tweet_id, text, time, parent, follower_count, user_id])
                continue
                
        
        responses_list.append(responses)
    return out_list, responses_list, reply_list



# test data
def no_label_json(jsonl):
    output = []
    for line in jsonl:

        data = json.loads(line)
        
        for tweet_data in data:

            tweet_id = tweet_data['id']
            user_id = tweet_data['user']['id']
            follower_count = tweet_data['user']['followers_count']
            text = tweet_data['text']
            time = tweet_data['created_at']
            parent = tweet_data['in_reply_to_status_id']


            tweet_dict[tweet_id] = {'user_id' : user_id,
                                   'follower_count' : follower_count,
                                   'text' : text,
                                   'time': time,
                                   'parent': parent}

            output.append([tweet_id, text, time, parent, follower_count, user_id])
    return output
    
train_list, train_responses, train_reply = jsonl_to_list(train_data_jsonl, train_label_json)
dev_list, dev_responses, dev_reply = jsonl_to_list(dev_data_jsonl, dev_label_json)



test_list = no_label_json(test_data_jsonl)
covid_list = no_label_json(covid_data_jsonl)

In [4]:
# Make Dataframes

train_df = DataFrame(train_list, columns=['tweet_id', 'text', 'time', 'parent', 'label', 'follower_count', 'user_id'])
train_df['responses'] = np.asarray(train_responses)

train_reply_df = DataFrame(train_reply, columns=['tweet_id', 'text', 'time', 'parent', 'follower_count', 'user_id'])


dev_df = DataFrame(dev_list, columns=['tweet_id', 'text', 'time', 'parent', 'label', 'follower_count', 'user_id'])
dev_df['responses'] = np.asarray(dev_responses)

dev_reply_df = DataFrame(dev_reply, columns=['tweet_id', 'text', 'time', 'parent', 'follower_count', 'user_id'])

test_df = DataFrame(test_list, columns=['tweet_id', 'text', 'time', 'parent', 'follower_count', 'user_id'])

# Task 2
covid_df = DataFrame(covid_list, columns=['tweet_id', 'text', 'time', 'parent', 'follower_count', 'user_id'])


# Merged DF
frames = [train_df, dev_df]

merged_df = pd.concat(frames)

In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

def get_sentiment_score_ind(text):

    return sid.polarity_scores(text)['compound']

def sent_dict(df):
    parent_ids = list(df['parent'].unique())
    sentiment_dict = {}
    for parent in parent_ids:
        replies = df[df['parent'] == parent]['text']

        total_sent = []
        for tweet in replies:
            score = get_sentiment_score_ind(tweet)
            total_sent.append(score)

        mean = np.mean(total_sent)
#         median = np.median(total_sent)
        sentiment_dict[parent] = mean
    return sentiment_dict

def senti_results(df, label, senti_dict):
    out_list = []
    for tw in list(df[df['label'] == label]['tweet_id']):
        try:
            out_list.append(senti_dict[tw])
        except KeyError:
            continue
    return out_list

In [6]:
# Get Sentiment of Replies
# train_senti_dict = sent_dict(train_reply_df) 
# dev_senti_dict = sent_dict(dev_reply_df) 

# train_rumour_reply_sent = senti_results(train_df, 1, train_senti_dict)
# train_nonrumour_reply_sent = senti_results(train_df, 0, train_senti_dict)
# dev_rumour_reply_sent = senti_results(dev_df, 1, dev_senti_dict)
# dev_nonrumour_reply_sent = senti_results(dev_df, 0, dev_senti_dict)

In [7]:
train_sentences = list(train_df.text.values)
train_labels = list(train_df.label.values)
dev_sentences = list(dev_df.text.values)
dev_labels = list(dev_df.label.values)
test_sentences = list(test_df.text.values)

merged_sentences = list(merged_df.text.values)
merged_labels = list(merged_df.label.values)

test_text = list(test_df.text.values)

# BoW Model

vectorizer = CountVectorizer(min_df=0, lowercase=False)
# vectorizer.fit(train_sentences)
vectorizer.fit(merged_sentences)

X_train = vectorizer.transform(merged_sentences)
y_train = merged_labels


X_test = vectorizer.transform(test_sentences)
# X_train = vectorizer.transform(train_sentences)
# y_train = train_labels
# X_dev = vectorizer.transform(dev_sentences)
# y_dev = dev_labels
# len(vectorizer.vocabulary_)

# vectorizer.transform(sentences).toarray()

In [8]:
LRclassifier = LogisticRegression()
LRclassifier.fit(X_train, y_train)

SGDclass = SGDClassifier()
SGDclass.fit(X_train, y_train)

NBclassifier = MultinomialNB()
NBclassifier.fit(X_train, y_train)


# score = classifier.score(X_dev, y_dev)

MultinomialNB()

In [9]:
# lr_pred_test = LRclassifier.predict(X_test)
lr_pred_dev_prob = LRclassifier.predict_proba(X_dev)

# sg_pred_test = SGDclass.predict(X_test)
sg_pred_dev_prob = SGDclass.predict_proba(X_dev)

# nb_pred_test = NBclassifier.predict(X_test)
nb_pred_dev_prob = NBclassifier.predict_proba(X_dev)

NameError: name 'X_dev' is not defined

In [None]:
def class_label(num):
    if num == 0:
        return 'non-rumour'
    else:
        return 'rumour'

out_label = []

for num in list(nb_pred_test):
    out_label.append(class_label(num))

In [None]:
# out_label
# id_list = [str(i) for i in list(test_df.tweet_id)]
# out_dict = {}

# for i, t in enumerate(id_list):
#     out_dict[t] = out_label[i]

In [None]:
# with open("test_out_nb.json", "w") as outfile: 
#     json.dump(out_dict, outfile)

In [None]:
# out_dict

In [None]:
# Dev set statistics
print("LR Model")
print("precision:", precision_score(y_dev, lr_pred_dev))
print("recall:", recall_score(y_dev, lr_pred_dev))
print("f1:", f1_score(y_dev, lr_pred_dev))

# Dev set statistics
print("SG Model")
print("precision:", precision_score(y_dev, sg_pred_dev))
print("recall:", recall_score(y_dev, sg_pred_dev))
print("f1:", f1_score(y_dev, sg_pred_dev))


# Dev set statistics
print("NB Model")
print("precision:", precision_score(y_dev, nb_pred_dev))
print("recall:", recall_score(y_dev, nb_pred_dev))
print("f1:", f1_score(y_dev, nb_pred_dev))
# print(score)

In [None]:
# # If the reply sentiment score is lower than -0.1, increase prob of rumour
# for t_id in train_senti_dict.keys():
#     try:
#         if train_senti_dict[t_id] <= -0.1:
#             edit = pred_dev_dict[t_id]
#             edit[1] = edit[1] + 0.4
#             pred_dev_dict[t_id] = edit
#     except KeyError:
#         continue
        