In [1]:
%load_ext autoreload
%autoreload 2

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from src.coordinator import Coordinator
from src.utils.preprocessing import TextPreprocessor

In [4]:
coord = Coordinator()

In [5]:
df = pd.read_json(coord.data_interim.joinpath('dataset_v1.jsonl'), lines=True)

In [23]:
user_types = {'musicians': ['taylorswift13', 'shakira', 'selenagomez', 'rihanna', 'katyperry', 'justinbieber', 'jtimberlake', 'britneyspears', 
                            'ArianaGrande', 'ladygaga'], 
             'politician': ['realDonaldTrump', 'narendramodi', 'BarackObama'],
             'brand': ['YouTube', 'Twitter', 'cnnbrk'], 
             'entertainer': ['jimmyfallon', 'TheEllenShow'],
             'media_personality': ['KimKardashian'],  
             'sportsperson': ['Cristiano']}

def get_user_type(user):
    for user_type in user_types:
        if user in user_types[user_type]:
            return user_type

df['user_type'] = df['user'].apply(get_user_type)

In [24]:
df.head(3)

Unnamed: 0,tweet_id,user,time_epoch,tweet,n_likes,n_retweets,n_replies,n_emojis,quoted_tweet,hashtags,mentions,quoted_tweet_screen_name,quoted_tweet_name,quoted_tweet_hashtags,quoted_tweet_mentions,quoted_tweet_n_emojis,user_type
0,1249733419401183232,KimKardashian,1586794640,"[restock, waist, trainer, clay, onyx, size, xx...",3850,166,340,0,,[],[skims],,,[],[],0,media_personality
1,1249338384847220738,KimKardashian,1586700457,"[sweet, baby, true, happy, birthday, celebrate...",30039,1075,174,2,,[],[],,,[],[],0,media_personality
2,1249080285972410368,KimKardashian,1586638921,"[happen, text, message, iphone, blank, convers...",23384,797,2060,0,,[],[],,,[],[],0,media_personality


In [8]:
preprocessor = TextPreprocessor(lowercase=True, clean_links=True, clean_punctuation=True, expand_contractions=True, remove_stop_words=True, 
                                process_numbers='remove', normalization_type='lemma', clean_mentions=True)

In [9]:
df['tweet'] = df['tweet'].apply(preprocessor)

In [15]:
df = df[df['tweet'].apply(lambda tweet: bool(tweet))]

In [25]:
X, y = df['tweet'], df['user_type']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=20000, random_state=42, stratify=y)

In [29]:
len(X_test)

20000

In [34]:
tfidf = TfidfVectorizer(lowercase=False, tokenizer=lambda tokens: tokens, max_features=50000)
tfidf.fit(df['tweet'])



TfidfVectorizer(lowercase=False, max_features=50000,
                tokenizer=<function <lambda> at 0x7ff2713ced90>)

In [35]:
X_train, X_test = tfidf.transform(X_train), tfidf.transform(X_test)

# Classification

## LogisticRegression

In [30]:
from sklearn.linear_model import LogisticRegression

In [36]:
%%time
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg.score(X_test, y_test)

CPU times: user 54.6 s, sys: 28.1 s, total: 1min 22s
Wall time: 32.5 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8262

## SVC

In [37]:
from sklearn.svm import LinearSVC

In [38]:
%%time
linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
linear_svc.score(X_test, y_test)

CPU times: user 7.06 s, sys: 4.08 ms, total: 7.07 s
Wall time: 7.07 s


0.8446

## KNN

In [42]:
from sklearn.neighbors import KNeighborsClassifier

In [43]:
%%time
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

CPU times: user 1min 6s, sys: 20.1 s, total: 1min 26s
Wall time: 1min 27s


0.47535

# TopicModeling

### LDA

In [44]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=6, random_state=42)
lda.fit(tfidf.transform(X))

LatentDirichletAllocation(n_components=6, random_state=42)

In [45]:
vocab = tfidf.get_feature_names()

n_top_words = 10

topic_words = {}

for topic, comp in enumerate(lda.components_):    
    word_idx = np.argsort(comp)[::-1][:n_top_words]
    print([vocab[x] for x in word_idx],"\n")

['president', 'trump', 'obama', 'great', 'vote', 'country', 'america', 'donald', 'people', 'job'] 

['india', 'people', 'shak', 'en', 'ji', 'la', 'development', 'work', 'nation', 'el'] 

['love', 'rt', 'happy', 'birthday', 'great', 'good', 'day', 'hope', 'time', 'today'] 

['police', 'kill', 'official', 'people', 'attack', 'death', 'suspect', 'report', 'shoot', 'dead'] 

['tonight', 'rt', 'love', 'fallontonight', 'watch', 'day', 'today', 'music', 'play', 'season'] 

['video', 'rt', 'love', 'क', 'wait', 'guy', 'watch', 'day', 'excite', 'today'] 



# Clustering

In [49]:
from collections import Counter

In [54]:
from sklearn.cluster import KMeans
from sklearn import metrics

In [52]:
X = tfidf.transform(X)

In [56]:
%%time
kmeans = KMeans(n_clusters=6)
kmeans.fit(X)

CPU times: user 1min 17s, sys: 341 ms, total: 1min 17s
Wall time: 25.4 s


KMeans(n_clusters=6)

In [57]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder().fit(y)

In [61]:
metrics.adjusted_rand_score(kmeans.predict(X), le.transform(y))

0.023316056291637462