In [1]:
import pandas as pd
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
from text_cleaner import TextCleaner

### Load the dataset

In [3]:
# Read the NFLX message dump; pandas infers gzip compression from the '.gz' suffix.
dataset_path = '../data/NFLX.csv.gz'
df = pd.read_csv(dataset_path)

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,created_at,user_name,user_id,bear_bull_tag,text,message_id
0,2022-01-25 02:17:32+00:00,nightwatchman99,1635725,Bullish,$NFLX shhh,429999796
1,2022-01-25 02:13:48+00:00,OptionsPlayers,619769,NIL,$AMD $NVDA $NFLX Print and Post this in your t...,429999027
2,2022-01-25 02:05:04+00:00,LucidDreamer,4651936,Bullish,$NFLX $400 weekly calls up over 70% already. H...,429997295
3,2022-01-25 02:03:08+00:00,Christmas_is_my_favorit7,4346156,NIL,$NFLX mark this post. We see 350 again. I&#39...,429996910
4,2022-01-25 02:02:51+00:00,Marlin2008,3168679,NIL,$NFLX 350 I’m loading the boat. Will see if it...,429996849


In [5]:
# Tally how many rows each user_name contributes, then attach that per-author
# total to every one of the author's rows.
author_names = df['user_name']
user_tweet_counts = Counter(author_names)
df['tweet_count'] = author_names.map(user_tweet_counts)

In [6]:
# Parse the timestamp strings into timezone-aware datetimes. utc=True normalises
# every value to UTC, so the column gets a single datetime64 dtype (and the `.dt`
# accessor below keeps working) even if some rows carry a different UTC offset.
df['created_at'] = pd.to_datetime(df['created_at'], utc=True)
# Hour of posting (0-23, in UTC), used later as one of the model features.
df['hour_of_day'] = df['created_at'].dt.hour

In [7]:
# Project-local text normaliser imported from text_cleaner. Judging by the cell
# output, constructing it triggers NLTK resource downloads (punkt, stopwords,
# wordnet).
cleaner = TextCleaner()

[nltk_data] Downloading package punkt to /Users/anshulrao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anshulrao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anshulrao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Clean every message with the project's TextCleaner. Mapping over the single
# 'text' column avoids materialising a full row Series per record (as a row-wise
# df.apply(axis=1) does) while calling clean_text on exactly the same values.
# NOTE: this overwrites the raw text in place, so re-running the cell cleans
# already-cleaned text; reload df first if a fresh run is needed.
df['text'] = df['text'].map(cleaner.clean_text)

### Use Latent Dirichlet Allocation (LDA) to extract topics from the text

In [9]:
# Topic model with 5 latent topics; the fixed seed keeps the fit reproducible.
lda = LatentDirichletAllocation(
    random_state=42,
    n_components=5,
)
# Bag-of-words counts capped at 1000 vocabulary terms, with scikit-learn's
# built-in English stop-word list removed.
vectorizer = CountVectorizer(
    max_features=1000,
    stop_words='english',
)

In [10]:
# Learn the vocabulary from the cleaned messages and build the document-term
# count matrix (one row per message).
text_features = vectorizer.fit_transform(df['text'])
# Fit LDA on those counts; the result holds one topic-weight row per message,
# with one column per topic (see the topic_* columns shown further down).
topic_features = lda.fit_transform(text_features)

In [11]:
# One column per LDA topic. The column count is taken from the fitted matrix
# itself rather than a hard-coded 5, so it cannot drift out of sync with the
# number of components the LDA model was built with.
topic_columns = ['topic_{}'.format(i) for i in range(topic_features.shape[1])]
df[topic_columns] = topic_features

In [12]:
# Model inputs: per-author posting volume, posting hour, and the per-message
# topic weights. The topic column names are derived from the width of
# topic_features instead of a hard-coded 5, so changing the number of LDA
# components does not require editing this cell.
topic_columns = ['topic_{}'.format(i) for i in range(topic_features.shape[1])]
features = df[['tweet_count', 'hour_of_day'] + topic_columns]

In [14]:
# Preview the first five rows of the assembled feature matrix.
features.head(n=5)

Unnamed: 0,tweet_count,hour_of_day,topic_0,topic_1,topic_2,topic_3,topic_4
0,14,2,0.101347,0.101701,0.102274,0.102841,0.591837
1,439,2,0.012613,0.322967,0.302334,0.012961,0.349125
2,18,2,0.029245,0.028706,0.029006,0.029752,0.883291
3,31,2,0.020224,0.154577,0.020203,0.020123,0.784874
4,3,2,0.067039,0.06716,0.067278,0.067415,0.731109


### Detecting outliers using Isolation Forest

In [16]:
# Standardise the features, then isolate outliers with a seeded Isolation Forest
# configured to treat 1% of the rows as anomalous.
scaler = StandardScaler()
outlier_detector = IsolationForest(contamination=0.01, random_state=42)
model = make_pipeline(scaler, outlier_detector)

# fit_predict labels each row: -1 for an outlier, 1 for an inlier.
df['bot_score'] = model.fit_predict(features)

### Display potential bots

In [22]:
# Rows the Isolation Forest labelled as outliers (-1).
is_outlier = df['bot_score'].eq(-1)
potential_bots = df.loc[is_outlier]

# List each flagged author once.
print("Potential Bots:")
print(potential_bots['user_name'].unique())

Potential Bots:
['ChartMill' 'Newsfilter' 'GrindTime' 'ElliottwaveForecast' 'Sliver'
 'STCKPRO' 'Trading4Living']


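Almost all of these accounts appear to be business accounts with thousands of followers that post frequently and in a systematic way. Note that the model flags individual messages rather than accounts, so an account is listed here if at least one of its messages was labelled as an outlier.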