# CS4242 Lab 1: Microblog Classification

**Author: Li Jiazhe (A0176576M)**

In [69]:
import sys
import os
import re
import nltk
from nltk.corpus import stopwords 
import simplejson as json
import numpy as np
import pandas as pd

### Step 1: Run Basic Classifier (given code)

In [70]:
def rm_html_tags(str): 
    html_prog = re.compile(r'<[^>]+>',re.S) 
    return html_prog.sub('', str) 

def rm_html_escape_characters(str):
    pattern_str = r'&quot;|&amp;|&lt;|&gt;|&nbsp;|&#34;|&#38;|&#60;|&#62;|&#160;|&#20284;|&#30524;|&#26684|&#43;|&#20540|&#23612;'
    escape_characters_prog = re.compile(pattern_str, re.S)
    return escape_characters_prog.sub('', str)

def rm_at_user(str):
    return re.sub(r'@[a-zA-Z_0-9]*', '', str)

def rm_url(str): 
    return re.sub(r'http[s]?:[/+]?[a-zA-Z0-9_\.\/]*', '', str)

def rm_repeat_chars(str):
    return re.sub(r'(.)(\1){2,}', r'\1\1', str)

def rm_hashtag_symbol(str):
    return re.sub(r'#', '', str)

def replace_emoticon(emoticon_dict, str):
    for k, v in emoticon_dict.items():
        str = str.replace(k, v)
    return str

def rm_time(str):
    return re.sub(r'[0-9][0-9]:[0-9][0-9]', '', str)

def rm_punctuation(current_tweet):
    return re.sub(r'[^\w\s]','',current_tweet)


def pre_process(str, porter):
    # do not change the preprocessing order only if you know what you're doing 
    str = str.lower()
    str = rm_url(str)        
    str = rm_at_user(str)        
    str = rm_repeat_chars(str) 
    str = rm_hashtag_symbol(str)       
    str = rm_time(str)        
    str = rm_punctuation(str)
        
    try:
        str = nltk.tokenize.word_tokenize(str)
        try:
            str = [porter.stem(t) for t in str]
        except:
            print(str)
            pass
    except:
        print(str)
        pass
        
    return str
                            



if __name__ == "__main__":
    data_dir = 'D:/Li Jiazhe/NUS/Semester 2/CS4242 Social Media Computing/LAB1/LAB1/data/data'  ##Setting your own file path here.

    x_filename = 'samples.txt'
    y_filename = 'labels.txt'

    porter = nltk.PorterStemmer()
    stops = set(stopwords.words('english'))
    stops.add('rt') 


    ##load and process samples
    print('start loading and process samples...')
    words_stat = {} 
    tweets = []
    hashtags_all = []
    name_all = []
    cnt = 0
    with open(os.path.join(data_dir, x_filename), encoding='utf-8') as f:
        for i, line in enumerate(f):
            postprocess_tweet = []
            tweet_obj = json.loads(line.strip(), encoding='utf-8') 
            description = tweet_obj['user']['description'].replace("\n"," ")
            content = tweet_obj['text'].replace("\n"," ")
            hashtags_dic = tweet_obj['entities']['hashtags']
            hashtags = [d['text'] for d in hashtags_dic]
            truncated_text = ['truncated_or_not_' + str(tweet_obj['truncated'])]
            user_name = tweet_obj['user']['name']
            text_words = pre_process(content, porter)
            des_words = pre_process(description, porter)
            user_name_words = pre_process(user_name, porter)
            words = text_words + des_words + hashtags+ hashtags + truncated_text + user_name_words + hashtags
            for word in words:
                if word not in stops:
                    postprocess_tweet.append(word)
                    if word in words_stat.keys():
                        words_stat[word][0] += 1 
                        if i != words_stat[word][2]:
                            words_stat[word][1] += 1
                            words_stat[word][2] = i
                    else:
                        words_stat[word] = [1,1,i]
            tweets.append(' '.join(postprocess_tweet))
            
            hashtags_all.append(' '.join(hashtags))
            hashtags_all = ['standsfornull' if x == '' else x for x in hashtags_all ]
            
            name_all.append(' '.join(user_name_words))
            


    print("The number of unique words in data set is %i." %len(words_stat.keys())) 
    # 12344 for text only, 19361 for adding user description, 24189 for adding hashtags, 24191 for adding truncated text
    # 27516 for adding user name
    lowTF_words = set()
    with open(os.path.join(data_dir, 'words_statistics.txt'), 'w', encoding='utf-8') as f:
        f.write('TF\tDF\tWORD\n')
        for word, stat in sorted(words_stat.items(), key=lambda i: i[1], reverse=True):
            f.write('\t'.join([str(m) for m in stat[0:2]]) + '\t' + word +  '\n')
            if stat[0]<2:
                lowTF_words.add(word)
    print("The number of low frequency words is %d." %len(lowTF_words))
    # 7079 for text only, 11222 for adding user description, 13995 for adding hashtags, 13995 for adding truncated text
    # 16028 for adding user name, 12707 for adding hashtags again.


    ###Re-process samples, filter low frequency words...
    fout = open(os.path.join(data_dir, 'samples_processed.txt'), 'w', encoding='utf-8')
    tweets_new = []
    for tweet in tweets:
        words = tweet.split(' ')
        new = [] 
        for w in words:
            if (w not in lowTF_words) or (w in hashtags_all) or (w in name_all):
                new.append(w)
        new_tweet = ' '.join(new)
        tweets_new.append(new_tweet)
        fout.write('%s\n' %new_tweet)
    fout.close()
    
    print("Preprocessing is completed")
    

start loading and process samples...
The number of unique words in data set is 27516.
The number of low frequency words is 12707.
Preprocessing is completed


 - Feature Extraction

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer
data_dir = 'D:/Li Jiazhe/NUS/Semester 2/CS4242 Social Media Computing/LAB1/LAB1/data/data' 

# print("Loading data...")
with open(os.path.join(data_dir, 'samples_processed.txt'), 'r', encoding='utf-8') as f:
	x = f.readlines()
with open(os.path.join(data_dir, 'labels.txt'), 'r', encoding='utf-8') as f:
	y = np.array(f.readlines())

# print("Extract features...")
x_feats = TfidfVectorizer().fit_transform(x)
print(x_feats.shape) # 5237 for text only, 8101 for adding user description, 9713 for adding hashtags, 9715 for adding 
# truncated text, 11830 for adding user name, 14131 for adding hashtags again.

(6000, 14131)


## Basic Classifier: Naive Bayes Classifier

In [72]:
import os
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

In [73]:
from sklearn.naive_bayes import MultinomialNB

# print("Start training and predict...")
fold = 10
kf = KFold(n_splits=fold)
avg_p = 0
avg_r = 0
avg_f1 = 0
for train, test in kf.split(x_feats): # 10 rounds
    model = MultinomialNB().fit(x_feats[train], y[train]) 
    predicts = model.predict(x_feats[test])
    avg_p += precision_score(y[test],predicts, average='macro')
    avg_r += recall_score(y[test],predicts, average='macro')
    avg_f1 += f1_score(y[test],predicts, average='macro')

nb_precision = avg_p/fold
nb_recall = avg_r/fold
nb_f1 = avg_f1/fold
print('Average Precision is %f.' %(nb_precision))
print('Average Recall is %f.' %(nb_recall))
print('F1 score is %f.' %(nb_f1))

Average Precision is 0.697134.
Average Recall is 0.690572.
F1 score is 0.689687.


## Step2: Try other classifiers

### KNN Classifier

In [74]:
from sklearn.neighbors import KNeighborsClassifier

# print("Start training and predict...")
fold = 10
kf = KFold(n_splits=fold)
K = [3,4,5,6,7,8,9,10,11,12,13,14,15]
avg_p_final = []
avg_r_final = []
avg_f1_final = []
for k in K:
    avg_p = 0
    avg_r = 0
    avg_f1 = 0
    for train, test in kf.split(x_feats):
        model = KNeighborsClassifier(n_neighbors=k).fit(x_feats[train], y[train])
        predicts = model.predict(x_feats[test])
#         print(classification_report(y[test],predicts))
        avg_p += precision_score(y[test],predicts, average='macro')
        avg_r += recall_score(y[test],predicts, average='macro')
        avg_f1 += f1_score(y[test],predicts, average='macro')
    avg_p_final.append(avg_p/fold)
    avg_r_final.append(avg_r/fold)
    avg_f1_final.append(avg_f1/fold)

knn_precision = max(avg_f1_final)
opt_k = K[avg_f1_final.index(knn_precision)]
knn_recall = avg_r_final[avg_f1_final.index(knn_precision)]
knn_f1 = avg_f1_final[avg_f1_final.index(knn_precision)]
print('The best k for knn is %d and the optimal precision is %f.' % (opt_k, knn_precision))
print('Average Recall is %f.' % knn_recall)
print('F1 score is %f.' %(knn_f1))

The best k for knn is 12 and the optimal precision is 0.597731.
Average Recall is 0.598760.
F1 score is 0.597731.


### Decision Tree Classifier

In [75]:
from sklearn import tree

# print("Start training and predict...")
fold = 10
kf = KFold(n_splits=fold)

avg_p = 0
avg_r = 0
avg_f1 = 0
for train, test in kf.split(x_feats):
    dt = tree.DecisionTreeClassifier(max_depth = 100)
    model = dt.fit(x_feats[train], y[train])
    predicts = model.predict(x_feats[test])
#         print(classification_report(y[test],predicts))
    avg_p += precision_score(y[test],predicts, average='macro')
    avg_r += recall_score(y[test],predicts, average='macro')
    avg_f1 += f1_score(y[test],predicts, average='macro')


tree_precision = avg_p/fold
tree_recall = avg_r/fold
tree_f1 = avg_f1/fold
print('Average Precision of decision tree is %f' % (tree_precision))
print('Average Recall is %f.' % tree_recall)
print('F1 score is %f.' %(tree_f1))

Average Precision of decision tree is 0.584852
Average Recall is 0.521570.
F1 score is 0.534734.


### Random Forest

In [76]:
from sklearn.ensemble import RandomForestClassifier

avg_p = 0
avg_r = 0
avg_f1 = 0
for train, test in kf.split(x_feats):
    rf = RandomForestClassifier(n_estimators = 500, max_features = 7, random_state=0)
    model = rf.fit(x_feats[train], y[train])
    predicts = model.predict(x_feats[test])
#         print(classification_report(y[test],predicts))
    avg_p += precision_score(y[test],predicts, average='macro')
    avg_r += recall_score(y[test],predicts, average='macro')
    avg_f1 += f1_score(y[test],predicts, average='macro')

rf_precision = avg_p/fold
rf_recall = avg_r/fold
rf_f1 = avg_f1/fold
print('Average Precision of RandomForest is %f' % (rf_precision))
print('Average Recall is %f.' % rf_recall)
print('F1 score is %f.' %(rf_f1))

Average Precision of RandomForest is 0.710513
Average Recall is 0.708644.
F1 score is 0.706380.
