# The Askeladden Algorithm - Initial Dataset Creation <a class="tocSkip">

## W207 | Applied Machine Learning | Spring 2019<a class="tocSkip">

### Team Troll Trappers: Laura Pintos, Ramiro Cadavid, and Anna Jacobson<a class="tocSkip">

In [2]:
# General libraries.
import numpy as np
import pandas as pd
import csv
import re

### Troll Dataset

In [13]:
# Read in csv file.
# dtype=object turns off per-column type inference, so every column is loaded
# with object dtype (the tweet year is cast to int64 further down).
df = pd.read_csv("ira_tweets_english.csv", dtype=object)

In [14]:
# Derive the tweet year from the first four characters of the 'tweet_time'
# string and store it as an integer column.
year_text = df['tweet_time'].str.slice(stop=4)
df['tweet_year'] = year_text.astype('int64')

In [15]:
# Filter for 2016-2018 tweets.
# 'tweet_year' is an int64 column, so the lookup values must be integers as well.
# Comparing an integer column against string years only works on pandas versions
# that coerce the values; otherwise nothing matches and the frame ends up empty.
years = [2016, 2017, 2018]
df = df[df['tweet_year'].isin(years)]

In [16]:
# Filter for user screen names containing specific words.
# The words are joined into one regex alternation and matched as a
# case-insensitive substring search, so e.g. 'New' also selects names that
# contain 'News'.
news = ['Daily', 'New', 'Today', 'Online']
esc_lst = [re.escape(s) for s in news]
pattern = '|'.join(esc_lst)
# na=False treats a missing screen name as a non-match; without it a single NaN
# leaves a NaN in the mask and the row selection below raises.
df = df[df['user_screen_name'].str.contains(pattern, case=False, na=False)]

In [18]:
# Check dataframe shape (rows, columns) after the year and screen-name filters.
df.shape

(296949, 33)

In [19]:
# Examine data: show the first rows of the filtered troll dataframe.
df.head()

Unnamed: 0.1,Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,user_profile_url,follower_count,following_count,...,longitude,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,poll_choices,tweet_year
1523823,4500995,682713637161549824,2601235821,Pittsburgh Today,TodayPittsburgh,"Pittsburgh, PA",Pittsburgh's local news on Twitter. Breaking n...,,20790,10408,...,absent,0.0,0.0,0.0,0.0,[news],[],,,2016
715787,2171579,682714058752012290,3074563039,St. Louis Online,StLouisOnline,"St Louis, MO","Breaking news, weather, traffic and more for S...",,10534,8576,...,absent,0.0,0.0,0.0,0.0,[sports],[],,,2016
912484,2737486,682714038904557568,2547141851,Chicago Daily News,ChicagoDailyNew,"Chicago, IL","Local news, sports, business, politics, entert...",,23595,13665,...,absent,0.0,0.0,1.0,0.0,[Breaking],[http://bit.ly/1mSVsvg],,,2016
2308615,6775548,682714784526041088,2494112058,San Jose Daily,DailySanJose,USA,"Follow for San Jose's breaking news, special r...",,20135,9313,...,absent,0.0,0.0,1.0,1.0,[SanJose],[],,,2016
2274939,6677232,682714630930563074,2743327187,Washington Online,WashingtOnline,"Washington, D.C.","Breaking news, weather, traffic and more for W...",,40762,14404,...,absent,0.0,0.0,0.0,0.0,"[business, news]",[],,,2016


In [20]:
# Examine number of tweets per user: row counts per screen name.
df['user_screen_name'].value_counts()

KansasDailyNews    25979
DailySanFran       24881
TodayNYCity        19832
ChicagoDailyNew    17462
OnlineCleveland    14650
StLouisOnline      14040
todayinsyria       12667
DailySanDiego      12353
DailyLosAngeles    12165
TodayPittsburgh    12088
PhoenixDailyNew    12062
DailySanJose       10827
NewOrleansON        9688
SanAntoTopNews      9676
TodayCincinnati     9601
OnlineMemphis       8703
WashingtOnline      8591
DetroitDailyNew     7731
OaklandOnline       7406
DallasTopNews       7336
TodayMiami          6957
NewspeakDaily       6910
TodayBostonMA       5324
PigeonToday         3824
Atlanta_Online      3770
HoustonTopNews      3604
ElPasoTopNews       2435
BlackNewsOutlet     2146
tpartynews          1531
redlanews           1462
MissouriNewsUS      1162
NatPolNews            82
TribunaOnline24        4
Name: user_screen_name, dtype: int64

In [21]:
# Check how many unique users there are (distinct screen names left after filtering).
df['user_screen_name'].nunique()

33

In [22]:
# Check to make sure that the user IDs are unique.
# This counts rows per user ID; presumably it is meant to be compared with the
# per-screen-name counts above to confirm that each screen name corresponds to
# a single user ID.
df['userid'].value_counts()

2587843805            25979
2495567768            24881
2752677905            19832
2547141851            17462
2753146444            14650
3074563039            14040
3899481526            12667
2630842499            12353
2624554209            12165
2601235821            12088
2753211010            12062
2494112058            10827
2530830345             9688
2753338899             9676
2577082467             9601
2570017414             8703
2743327187             8591
2571870453             7731
3074013672             7406
2675966513             7336
2944944427             6957
2928870434             6910
2591847731             5324
2912754262             3824
2944766250             3770
2628066159             3604
2537507303             2435
4301962823             2146
3990577513             1531
4289431230             1462
4208754922             1162
2951539528               82
731456806082498560        4
Name: userid, dtype: int64

In [23]:
# Label every remaining row as a troll tweet by adding a constant 'category' column.
df = df.assign(category='troll')

In [24]:
# Keep only the tweet text and its category label for the troll dataset.
trolls = df.loc[:, ['tweet_text', 'category']]

In [26]:
# Examine data: show the first rows of the two-column troll dataframe.
trolls.head()

Unnamed: 0,tweet_text,category
1523823,Old-Fashioned Oakdale Barber Shop Closes After...,troll
715787,"Former Cardinals manager Rapp, 87, dies #sports",troll
912484,#Breaking Divvy raises rates for annual bike-s...,troll
2308615,Santa Clara: Hearing set for suit to halt NFL'...,troll
2274939,George Lucas apologizes for calling Disney 'wh...,troll


In [27]:
# Downsample the troll tweets to the total number of real news tweets, so both
# categories contribute the same number of rows. 153188 is the row count shown
# for real_news.csv in the "Real News Dataset" section below; it is hard-coded
# here and must be updated by hand if that file changes.
# The fixed random_state makes the sample repeatable.
n_real_tweets = 153188
trolls = trolls.sample(n=n_real_tweets, random_state=2019)

In [28]:
# Check shape: the row count should equal the sample size requested above.
trolls.shape

(153188, 2)

### Real News Dataset

In [3]:
# Read in csv file.
# No dtype argument is passed here, so pandas infers the column types.
df2 = pd.read_csv("real_news.csv")

In [4]:
# Check dataframe shape (rows, columns) of the real-news data as loaded.
# The troll sample size used earlier in the notebook is meant to equal this row count.
df2.shape

(153188, 5)

In [5]:
# Examine data: show the first rows of the real-news dataframe.
df2.head()

Unnamed: 0.1,Unnamed: 0,user_id,user_screen_name,id,text
0,0,9300262,politico,864621529765748736,Russia shrugs off reports Trump shared classif...
1,1,15513604,foxandfriends,935087093352685569,Texas man reportedly imprisoned in United Arab...
2,2,15164565,Slate,961389241791021062,Giannis Antetokounmpo jumped over a 6-foot-6 d...
3,3,759251,CNN,796188356690530304,Dramatic mood shift at Clinton HQ; Many aides ...
4,4,15513604,foxandfriends,950322509903269888,RT @JaniceDean: A warm up for the eastern half...


In [7]:
# Rename the 'text' column to 'tweet_text', the column name used for the troll tweets.
df2 = df2.rename({'text': 'tweet_text'}, axis='columns')

In [8]:
# Examine number of tweets per user: row counts per real-news screen name.
df2['user_screen_name'].value_counts()

thehill            12951
FoxNews            11554
politico            9236
CNN                 8543
washingtonpost      8068
nytimes             7439
CBSNews             6746
TheEconomist        5882
CNNPolitics         5019
latimes             4846
Slate               4801
USATODAY            4586
ABCPolitics         3869
dallasnews          3794
NBCNews             3529
ArkansasOnline      3116
chicagotribune      3019
MSNBC               2968
TheStreet           2966
DRUDGE_REPORT       2965
fox5dc              2847
BuzzFeedNews        2469
nbcsandiego         2463
NPR                 2431
HuffPost            2356
NewarkAdvocate      2197
NewYorker           2194
foxandfriends       2185
MotherJones         2080
FreeBeacon          2076
WSJPolitics         1664
YahooNews           1578
theblaze            1462
BreitbartNews       1352
elpasotimes         1286
BBCNorthAmerica     1083
foxnewsradio         838
WSJopinion           566
newsmax              557
TheDailyShow         541


In [9]:
# Check how many unique users there are (distinct real-news screen names).
df2['user_screen_name'].nunique()

49

In [10]:
# Label every row as a real-news tweet by adding a constant 'category' column.
df2 = df2.assign(category='real')

In [11]:
# Create new dataframe with tweet text and category label only.
real = df2[['tweet_text', 'category']]
# Display the shape: the output recorded for this cell is a (rows, columns)
# tuple, which the assignment on its own does not produce.
real.shape

(153188, 2)

In [12]:
# Examine data: show the first rows of the two-column real-news dataframe.
real.head()

Unnamed: 0,tweet_text,category
0,Russia shrugs off reports Trump shared classif...,real
1,Texas man reportedly imprisoned in United Arab...,real
2,Giannis Antetokounmpo jumped over a 6-foot-6 d...,real
3,Dramatic mood shift at Clinton HQ; Many aides ...,real
4,RT @JaniceDean: A warm up for the eastern half...,real


### Combine Data

In [30]:
# Stack the sampled troll rows on top of the real-news rows (row-wise concatenation).
# Each frame keeps its own row labels at this point.
result = pd.concat([trolls, real], axis=0)

In [31]:
# Check dataframe shape: the row count should be the sum of the troll and real-news row counts.
result.shape

(306376, 2)

In [32]:
# Reset the data indices.
# The concatenated frame still carries each source frame's own row labels;
# drop=True discards them instead of keeping them as a column, leaving a fresh
# default integer index.
result = result.reset_index(drop=True)

In [33]:
# Examine the data: show the first rows of the combined dataframe.
result.head()

Unnamed: 0,tweet_text,category
0,Brewers-Cubs first pitch scheduled for 2:15 p....,troll
1,Even more medicine coming to Liberty Township ...,troll
2,Chicago fugitive apprehended in Lithonia https...,troll
3,11 killed in East TN wildfires; TBI investigat...,troll
4,GOP campaigns flock to Fargo to vie for delega...,troll


In [34]:
# Write the combined dataset to a new csv file.
# index=True (the default, spelled out here) also writes the row index as an
# unnamed first column; it comes back as 'Unnamed: 0' when the file is read
# without index_col.
# NOTE(review): consider index=False if no downstream reader relies on that column.
result.to_csv("news_tweets_big.csv", index=True)