In [0]:
# Mount Google Drive at /content/drive so the data files referenced below can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import pandas as pd
# Make pandas treat +/-inf as missing values in its NA checks (relevant to the ratio
# features further down, which can divide by zero).
# NOTE(review): this option is deprecated in recent pandas releases - confirm against the installed version.
pd.set_option('use_inf_as_na', True)
import csv
from datetime import datetime
import re

In [0]:
# Data folder on the mounted Google Drive; every input file below is read relative to it.
path_to_file = "/content/drive/My Drive/data/TwitterFeeds-master/"

**Let's start with the user data and their features - any features created using the tweet data can be joined later**

So this includes the 5000 accounts randomly sampled from our 'climate emergency' dataset and 18000 accounts which have been labelled as either bots or genuine users. (source?) The next steps are:

- Load in the data and standardise columns - are there features that aren't in both and that we can find?
- Apply the features we have created in relation to the user account details - again we can create more of these

**Load in the tweet data (the 200 most recent tweets as of ~March 2020 for our sample, then a variety of dates for our training data)**

- Load in the tweet data for both samples and standardise columns etc
- Apply the tweet based features we have created 
- Join these back to the user data - for example average words per tweet for each user etc
- We can now apply a variety of supervised and unsupervised algorithms to the training data to apply to our random sample of 5000 users who contributed to the climate emergency debate during the time we collected data.
- Literature suggests that unsupervised methods often produce better results: 
 - fast greedy (Cresci et al., 2017)
 - digital DNA (Cresci et al., 2016)
 - graph clustering (Ahmed et al., 2013)
 
Happy hunting!

**Data folder in gdrive**

- *5000_accounts_climate.csv* - the 5000 accounts from climate emergency with user features
- *5000_tweets_climate.csv* - most recent 200 tweets from the 5000 accounts 
- *5000_tweets_frequency.csv* - features based on tweet frequency from the tweets of 5000 users
- *training_users_tag.csv* - this is now the 18000 training data we have tweet data on as well
- *training_tweets.txt* - most recent 200 tweets from the 18000 accounts columns = ['dt','text','tweetid','username']

In [0]:
# Climate-emergency sample: raw account fields only, no engineered features yet.
climate_accounts_csv = path_to_file + "5000_accounts_climate.csv"
users_5000 = pd.read_csv(climate_accounts_csv)
users_5000.head(1)

Unnamed: 0,id,name,username,location,url,description,verified,followers,friends,favourites_count,statuses_count,created_at,default_profile,default_profile_image
0,1098803589609189376,💧The Cranky Croation,JohnSarich2,,,My First ever vote was for Gough Whitlam. Left...,False,430,291,14866,6039,2019-02-22 04:36:05,True,False


In [0]:
# Labelled training accounts (training_users_tag.csv) - the ones we could also source tweets for.
training_accounts_csv = path_to_file + "training_users_tag.csv"
users_train = pd.read_csv(training_accounts_csv)
users_train.head(1)

Unnamed: 0.1,Unnamed: 0,id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,url,lang,time_zone,location,default_profile,default_profile_image,geo_enabled,profile_image_url,profile_banner_url,profile_use_background_image,profile_background_image_url_https,profile_text_color,profile_image_url_https,profile_sidebar_border_color,profile_background_tile,profile_sidebar_fill_color,profile_background_image_url,profile_background_color,profile_link_color,utc_offset,description,created_at,class,tag_stock,tag_politics,tag_pronbot,tag_business,tag_fake_follower,tag_spambot,tag_traditional_spambot
0,0,418,Dennis Crowley,dens,69341,85422,2623,14990,4491,https://t.co/63fYABYs9J,en,,NYC / Kingston,False,False,True,http://pbs.twimg.com/profile_images/7536729177...,https://pbs.twimg.com/profile_banners/418/1398...,True,https://abs.twimg.com/images/themes/theme1/bg.png,0,https://pbs.twimg.com/profile_images/753672917...,87BC44,True,E0FF92,http://abs.twimg.com/images/themes/theme1/bg.png,FFFFFF,0000FF,,"I like to build things (@Foursquare📱, @Stockad...",Wed Jul 05 19:52:46 +0000 2006,human,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
# Compare the two schemas: which climate-sample columns have no same-named training column?
a = users_5000.columns
b = users_train.columns

missing = list(set(a).difference(set(b)))
print(missing,'are not in the training data!')

['username', 'verified', 'friends', 'followers'] are not in the training data!


 - username = screen_name
 - followers = followers_count
 - no verified field
 - friends = friends_count

In [0]:
# Columns present under the same name in both frames, followed by the training-side
# names of the columns that only differ by name (see the mapping listed above).
matches = list(set(a).intersection(set(b)))
matches.extend(['screen_name','followers_count','friends_count'])

In [0]:
# Display the combined list of columns that will be kept from the training data.
matches

['name',
 'description',
 'location',
 'created_at',
 'default_profile_image',
 'favourites_count',
 'statuses_count',
 'default_profile',
 'id',
 'url',
 'screen_name',
 'followers_count',
 'friends_count']

In [0]:
# The last eight training columns are the labels: `tag` holds only the first of them
# (the overall class), `tags` holds all eight (class plus the per-type flags).
label_columns = users_train.columns[-8:]
tag = [label_columns[0]]
tags = label_columns.tolist()

In [0]:
# Show both label selections, one per line.
print(tag, tags, sep='\n')

['class']
['class', 'tag_stock', 'tag_politics', 'tag_pronbot', 'tag_business', 'tag_fake_follower', 'tag_spambot', 'tag_traditional_spambot']


In [0]:
# Add the chosen label column(s) to the selection: `tag` keeps only the class label,
# swap in `tags` to keep the per-type flags as well.
# Labels already present are skipped so that re-running this cell does not list
# 'class' twice in `matches`.
matches = matches + [label for label in tag if label not in matches]

# Apply the selection to the training data so both datasets share a standard schema.
# .copy() makes an independent frame: a bare column selection is tracked by pandas as
# a slice of `users_train`, and editing it in place later raises SettingWithCopyWarning.
users_train_2 = users_train[matches].copy()
print(users_train_2.columns)
users_train_2.head(1)

Index(['name', 'description', 'location', 'created_at',
       'default_profile_image', 'favourites_count', 'statuses_count',
       'default_profile', 'id', 'url', 'screen_name', 'followers_count',
       'friends_count', 'class'],
      dtype='object')


Unnamed: 0,name,description,location,created_at,default_profile_image,favourites_count,statuses_count,default_profile,id,url,screen_name,followers_count,friends_count,class
0,Dennis Crowley,"I like to build things (@Foursquare📱, @Stockad...",NYC / Kingston,Wed Jul 05 19:52:46 +0000 2006,False,14990,69341,False,418,https://t.co/63fYABYs9J,dens,85422,2623,human


In [0]:
# 'verified' has no counterpart in the training data, so leave it out for now;
# it is a useful signal and can be added back later.
users_5000_2 = users_5000.drop('verified', axis=1)

In [0]:
# Rename the training columns to the names used by the climate sample.
# Rebinding the returned frame (instead of inplace=True) avoids mutating what pandas
# tracks as a slice of `users_train`, which is what raised SettingWithCopyWarning.
users_train_2 = users_train_2.rename(
    columns={
        'screen_name': 'username',
        'followers_count': 'followers',
        'friends_count': 'friends',
    }
)
users_train_2.head(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,name,description,location,created_at,default_profile_image,favourites_count,statuses_count,default_profile,id,url,username,followers,friends,class
0,Dennis Crowley,"I like to build things (@Foursquare📱, @Stockad...",NYC / Kingston,Wed Jul 05 19:52:46 +0000 2006,False,14990,69341,False,418,https://t.co/63fYABYs9J,dens,85422,2623,human


**Normalised data pre-processing.**

In [0]:
# Give the standardised training frame its working name for feature engineering.
accounts_train = users_train_2
# NOTE: removing the old name makes this cell single-shot - re-running it requires
# re-running the cells that build `users_train_2` first.
del users_train_2
# info() prints its report itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
accounts_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19675 entries, 0 to 19674
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   name                   19673 non-null  object
 1   description            12898 non-null  object
 2   location               11553 non-null  object
 3   created_at             19675 non-null  object
 4   default_profile_image  8185 non-null   object
 5   favourites_count       19675 non-null  int64 
 6   statuses_count         19675 non-null  int64 
 7   default_profile        12097 non-null  object
 8   id                     19675 non-null  int64 
 9   url                    5646 non-null   object
 10  username               19675 non-null  object
 11  followers              19675 non-null  int64 
 12  friends                19675 non-null  int64 
 13  class                  19675 non-null  object
dtypes: int64(5), object(9)
memory usage: 2.1+ MB
None


In [0]:
# Give the climate-emergency sample its working name for feature engineering.
accounts_test = users_5000_2
# NOTE: removing the old name makes this cell single-shot - re-running it requires
# re-running the cell that builds `users_5000_2` first.
del users_5000_2
# info() prints its report itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
accounts_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4963 entries, 0 to 4962
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     4963 non-null   int64 
 1   name                   4963 non-null   object
 2   username               4963 non-null   object
 3   location               3719 non-null   object
 4   url                    2040 non-null   object
 5   description            4314 non-null   object
 6   followers              4963 non-null   int64 
 7   friends                4963 non-null   int64 
 8   favourites_count       4963 non-null   int64 
 9   statuses_count         4963 non-null   int64 
 10  created_at             4963 non-null   object
 11  default_profile        4963 non-null   bool  
 12  default_profile_image  4963 non-null   bool  
dtypes: bool(2), int64(5), object(6)
memory usage: 436.3+ KB
None


In [0]:
# Put the training columns in the same order as the climate sample, label column last.
col_order = [*accounts_test.columns, 'class']
accounts_train = accounts_train[col_order]
accounts_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19675 entries, 0 to 19674
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     19675 non-null  int64 
 1   name                   19673 non-null  object
 2   username               19675 non-null  object
 3   location               11553 non-null  object
 4   url                    5646 non-null   object
 5   description            12898 non-null  object
 6   followers              19675 non-null  int64 
 7   friends                19675 non-null  int64 
 8   favourites_count       19675 non-null  int64 
 9   statuses_count         19675 non-null  int64 
 10  created_at             19675 non-null  object
 11  default_profile        12097 non-null  object
 12  default_profile_image  8185 non-null   object
 13  class                  19675 non-null  object
dtypes: int64(5), object(9)
memory usage: 2.1+ MB


In [0]:
# Inspect the raw encodings used for the two profile flags in the training data.
for flag_column in ('default_profile', 'default_profile_image'):
    print(accounts_train[flag_column].unique())

['FALSE' nan 'TRUE' '1']
['FALSE' nan 'TRUE' '1']


In [0]:
# Convert the two profile flags to 0/1 in both datasets, keeping missing values as NaN.
flag_columns = ['default_profile', 'default_profile_image']

# Climate sample: the flags are real booleans, so an integer cast gives 0/1
# (and is harmless if the cell is run again on already-converted integers).
accounts_test[flag_columns] = accounts_test[flag_columns].astype('int64')

# Training data: the flags are strings ('TRUE', 'FALSE', '1') or missing.
for flag_column in flag_columns:
    raw_flags = accounts_train[flag_column]
    if pd.api.types.is_numeric_dtype(raw_flags):
        # Already converted on an earlier run. Converting again would map every
        # 1.0 to 0, because numeric values never match the strings 'TRUE'/'1'.
        continue
    is_set = raw_flags.isin(['TRUE', '1']).astype('float64')
    # where() blanks out rows that were missing in the raw data, so NaN survives
    # without masking the whole frame or relying on the np.NaN alias.
    accounts_train[flag_column] = is_set.where(raw_flags.notna())

In [0]:
# info() prints its report itself and returns None; calling it bare avoids the
# stray "None" line that print(df.info()) adds after each report.
accounts_test.info()
accounts_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4963 entries, 0 to 4962
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     4963 non-null   int64 
 1   name                   4963 non-null   object
 2   username               4963 non-null   object
 3   location               3719 non-null   object
 4   url                    2040 non-null   object
 5   description            4314 non-null   object
 6   followers              4963 non-null   int64 
 7   friends                4963 non-null   int64 
 8   favourites_count       4963 non-null   int64 
 9   statuses_count         4963 non-null   int64 
 10  created_at             4963 non-null   object
 11  default_profile        4963 non-null   int64 
 12  default_profile_image  4963 non-null   int64 
dtypes: int64(7), object(6)
memory usage: 504.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19675 entries, 0 to 1

So we now have our training data and climate emergency accounts in a normalised format. Next job is to feature engineer both.

In [0]:
# Preview the first rows of the training accounts.
accounts_train.head()

Unnamed: 0,id,name,username,location,url,description,followers,friends,favourites_count,statuses_count,created_at,default_profile,default_profile_image,class
0,418,Dennis Crowley,dens,NYC / Kingston,https://t.co/63fYABYs9J,"I like to build things (@Foursquare📱, @Stockad...",85422,2623,14990,69341,Wed Jul 05 19:52:46 +0000 2006,0.0,0.0,human
1,586,Chris Sacca,sacca,The Rocky Mountains,,I love @crystale & our 3 girls. Used to invest...,1669241,1004,301974,71148,Thu Jul 13 09:05:49 +0000 2006,0.0,0.0,human
2,8557,Japhy Grant,japhygrant,"Los Angeles, CA",https://t.co/jTIWv0ILjC,"👨‍🚀🌵Founder, #WonderValleyProjects. Had a hand...",3207,122,8901,15191,Wed Oct 11 19:51:10 +0000 2006,0.0,0.0,human
3,12522,C.C. Chapman,cc_chapman,"Boston, MA",,Storyteller trying to leave it better than I f...,47372,22593,12468,139668,Wed Nov 15 15:03:14 +0000 2006,0.0,0.0,human
4,15913,Larry Hryb,majornelson,In your Xbox,https://t.co/UNpCOu2HKp,"""The Xbox Guy"" 🎮 • 💑@thehappygirl • Twitter & ...",1127687,5,7,54804,Wed Nov 22 20:28:03 +0000 2006,0.0,0.0,human


In [0]:
# Preview the first rows of the climate-emergency accounts.
accounts_test.head()

Unnamed: 0,id,name,username,location,url,description,followers,friends,favourites_count,statuses_count,created_at,default_profile,default_profile_image
0,1098803589609189376,💧The Cranky Croation,JohnSarich2,,,My First ever vote was for Gough Whitlam. Left...,430,291,14866,6039,2019-02-22 04:36:05,1,0
1,191393940,FranS #RejoinEU #ElectoralReform 🇬🇧🇪🇺🇩🇪,FranS199,Earth,,"Love good food, good wine, great company. Hate...",4643,4904,257039,41696,2010-09-16 10:30:21,1,0
2,1185167241819676673,mbnvcxz,mbnvcxz2,,,,0,3,1,8,2019-10-18 12:14:42,1,1
3,1220868357647368193,Noles-4-Life-In-SC,Noles4LifeInSC,,,Florida native living in Upstate South Carolin...,628,630,1852,1862,2020-01-25 00:39:20,1,0
4,347142932,Life Cycle UK,LifeCycleUKteam,Bristol,http://t.co/oCrHfe7xAg,Life Cycle UK is a Bristol-based charity that ...,3105,1645,3039,5349,2011-08-02 10:21:33,0,0


In [0]:
# create functions for adding features to both datasets - we can add more features as more are developed!

In [0]:
# days since account has been open, measured against a fixed reference date
reference_day = pd.Timestamp('2020-04-11')

# 'created_at' is dropped at the end, so this guard makes re-running the cell a no-op
# instead of a KeyError.
if 'created_at' in accounts_test.columns:
    # created_at is split on its first space and only the leading date part is kept.
    # Indexing the split result replaces tuple-unpacking of the .str accessor, which
    # pandas has deprecated.
    date_text = accounts_test['created_at'].str.split(' ', n=1).str[0]
    accounts_test['date_created'] = pd.to_datetime(date_text, format='%Y-%m-%d')
    accounts_test['days_active'] = (reference_day - accounts_test['date_created']).dt.days
    accounts_test = accounts_test.drop(columns=['created_at'])

  """Entry point for launching an IPython kernel.


In [0]:
# days since account has been open, measured against a fixed reference date
reference_day = pd.Timestamp('2020-04-11')

# 'created_at' is dropped at the end, so this guard makes re-running the cell a no-op
# instead of a KeyError.
if 'created_at' in accounts_train.columns:
    created_raw = accounts_train['created_at']

    # Two encodings are mixed in this column: millisecond UNIX timestamps written with
    # a trailing 'L', and text of the form 'Wed Jul 05 19:52:46 +0000 2006'.
    is_epoch_ms = created_raw.str.endswith('L', na=False)

    # Build the result in a standalone Series and assign it once; the original
    # chained assignments (df.col[mask] = ...) raise SettingWithCopyWarning and are
    # not guaranteed to write through to the frame.
    date_created = pd.Series(pd.NaT, index=accounts_train.index, dtype='datetime64[ns]')

    # UNIX timestamp rows: strip the suffix, parse as integer milliseconds, truncate
    # to midnight. normalize() also copes with sub-second values, which the previous
    # str()/strptime round-trip could not parse.
    epoch_ms = created_raw[is_epoch_ms].str.rstrip('L').astype('int64')
    date_created.loc[is_epoch_ms] = pd.to_datetime(epoch_ms, unit='ms').dt.normalize()

    # Text rows: explicit %H:%M:%S rather than the locale-dependent %X.
    # NOTE(review): dates are taken in UTC; this matches the old behaviour as long as
    # every offset is +0000, as in the sample rows - confirm for the full file.
    parsed_text = pd.to_datetime(
        created_raw[~is_epoch_ms], format='%a %b %d %H:%M:%S %z %Y', utc=True
    )
    date_created.loc[~is_epoch_ms] = parsed_text.dt.tz_localize(None).dt.normalize()

    accounts_train['date_created'] = date_created
    accounts_train['days_active'] = (reference_day - date_created).dt.days
    accounts_train = accounts_train.drop(columns=['created_at'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [0]:
# social figures in relation to account age (count per day the account has existed)
per_day_features = {
    'followers_age': 'followers',
    'following_age': 'friends',
    'favourites_age': 'favourites_count',
    'tweets_age': 'statuses_count',
}
infinities = [np.inf, -np.inf]

for feature_name, count_column in per_day_features.items():
    per_day = accounts_train[count_column] / accounts_train['days_active']
    # A zero denominator gives +/-inf; store NaN explicitly instead of relying on the
    # global 'use_inf_as_na' option to hide it.
    accounts_train[feature_name] = per_day.replace(infinities, np.nan)

# following to follower ratio (NaN when the account has no followers)
accounts_train['followers_ratio'] = (
    accounts_train['friends'] / accounts_train['followers']
).replace(infinities, np.nan)

accounts_train.head(3)

Unnamed: 0,id,name,username,location,url,description,followers,friends,favourites_count,statuses_count,default_profile,default_profile_image,class,date_created,days_active,followers_age,following_age,favourites_age,tweets_age,followers_ratio
0,418,Dennis Crowley,dens,NYC / Kingston,https://t.co/63fYABYs9J,"I like to build things (@Foursquare📱, @Stockad...",85422,2623,14990,69341,0.0,0.0,human,2006-07-05,5029,16.985882,0.521575,2.980712,13.788228,0.030706
1,586,Chris Sacca,sacca,The Rocky Mountains,,I love @crystale & our 3 girls. Used to invest...,1669241,1004,301974,71148,0.0,0.0,human,2006-07-13,5021,332.451902,0.19996,60.142203,14.170086,0.000601
2,8557,Japhy Grant,japhygrant,"Los Angeles, CA",https://t.co/jTIWv0ILjC,"👨‍🚀🌵Founder, #WonderValleyProjects. Had a hand...",3207,122,8901,15191,0.0,0.0,human,2006-10-11,4931,0.650375,0.024741,1.805111,3.080714,0.038042


In [0]:
# social figures in relation to account age (count per day the account has existed)
per_day_features = {
    'followers_age': 'followers',
    'following_age': 'friends',
    'favourites_age': 'favourites_count',
    'tweets_age': 'statuses_count',
}
infinities = [np.inf, -np.inf]

for feature_name, count_column in per_day_features.items():
    per_day = accounts_test[count_column] / accounts_test['days_active']
    # A zero denominator gives +/-inf; store NaN explicitly instead of relying on the
    # global 'use_inf_as_na' option to hide it.
    accounts_test[feature_name] = per_day.replace(infinities, np.nan)

# following to follower ratio (NaN when the account has no followers)
accounts_test['followers_ratio'] = (
    accounts_test['friends'] / accounts_test['followers']
).replace(infinities, np.nan)

accounts_test.head(3)

Unnamed: 0,id,name,username,location,url,description,followers,friends,favourites_count,statuses_count,default_profile,default_profile_image,date_created,days_active,followers_age,following_age,favourites_age,tweets_age,followers_ratio
0,1098803589609189376,💧The Cranky Croation,JohnSarich2,,,My First ever vote was for Gough Whitlam. Left...,430,291,14866,6039,1,0,2019-02-22,414,1.038647,0.702899,35.908213,14.586957,0.676744
1,191393940,FranS #RejoinEU #ElectoralReform 🇬🇧🇪🇺🇩🇪,FranS199,Earth,,"Love good food, good wine, great company. Hate...",4643,4904,257039,41696,1,0,2010-09-16,3495,1.328469,1.403147,73.544778,11.930186,1.056214
2,1185167241819676673,mbnvcxz,mbnvcxz2,,,,0,3,1,8,1,1,2019-10-18,176,0.0,0.017045,0.005682,0.045455,


In [0]:
# username features - can only use A-Z, 0-9 and _ + not case-sensitive
handles = accounts_test['username']
display_names = accounts_test['name']

accounts_test['username_char_len'] = handles.str.len()

# length of the display name relative to the length of the username
accounts_test['name_ratio'] = display_names.str.len() / handles.str.len()

# how many digits / letters the username contains
accounts_test['username_int'] = [sum(ch.isdigit() for ch in handle) for handle in handles]
accounts_test['username_char'] = [sum(ch.isalpha() for ch in handle) for handle in handles]

# whatever is neither digit nor letter (underscores)
accounts_test['username_other'] = accounts_test['username_char_len'] - (
    accounts_test['username_int'] + accounts_test['username_char']
)

# length of the run of digits at the very end of the username (0 when there is none)
trailing_digits = [re.search(r'\d+$', handle) for handle in handles]
accounts_test['username_int_end'] = [
    len(hit.group()) if hit is not None else 0 for hit in trailing_digits
]

# digits in the display name (not common for real users, but maybe for businesses!)
# A missing display name yields NaN rather than a TypeError, matching how the
# training data is handled.
accounts_test['name_int'] = [
    np.nan if pd.isnull(display_name) else sum(ch.isdigit() for ch in display_name)
    for display_name in display_names
]

# is there any cases of bots using techniques to make account details appear real, emoji flags, hashtags etc?
accounts_test.head()

Unnamed: 0,id,name,username,location,url,description,followers,friends,favourites_count,statuses_count,default_profile,default_profile_image,date_created,days_active,followers_age,following_age,favourites_age,tweets_age,followers_ratio,username_char_len,name_ratio,username_int,username_char,username_other,username_int_end,name_int
0,1098803589609189376,💧The Cranky Croation,JohnSarich2,,,My First ever vote was for Gough Whitlam. Left...,430,291,14866,6039,1,0,2019-02-22,414,1.038647,0.702899,35.908213,14.586957,0.676744,11,1.818182,1,10,0,1,0
1,191393940,FranS #RejoinEU #ElectoralReform 🇬🇧🇪🇺🇩🇪,FranS199,Earth,,"Love good food, good wine, great company. Hate...",4643,4904,257039,41696,1,0,2010-09-16,3495,1.328469,1.403147,73.544778,11.930186,1.056214,8,4.875,3,5,0,3,0
2,1185167241819676673,mbnvcxz,mbnvcxz2,,,,0,3,1,8,1,1,2019-10-18,176,0.0,0.017045,0.005682,0.045455,,8,0.875,1,7,0,1,0
3,1220868357647368193,Noles-4-Life-In-SC,Noles4LifeInSC,,,Florida native living in Upstate South Carolin...,628,630,1852,1862,1,0,2020-01-25,77,8.155844,8.181818,24.051948,24.181818,1.003185,14,1.285714,1,13,0,0,1
4,347142932,Life Cycle UK,LifeCycleUKteam,Bristol,http://t.co/oCrHfe7xAg,Life Cycle UK is a Bristol-based charity that ...,3105,1645,3039,5349,0,0,2011-08-02,3175,0.977953,0.51811,0.957165,1.684724,0.529791,15,0.866667,0,15,0,0,0


In [0]:
# Username-derived features for the training accounts.
# (Usernames may only contain A-Z, 0-9 and _, and are not case-sensitive.)
handles = accounts_train['username']
display_names = accounts_train['name']

accounts_train['username_char_len'] = handles.str.len()

# display-name length relative to username length
accounts_train['name_ratio'] = display_names.str.len() / handles.str.len()

# digit and letter counts within the username
accounts_train['username_int'] = [sum(ch.isdigit() for ch in handle) for handle in handles]
accounts_train['username_char'] = [sum(ch.isalpha() for ch in handle) for handle in handles]

# remainder = characters that are neither digits nor letters (underscores)
accounts_train['username_other'] = accounts_train['username_char_len'] - (
    accounts_train['username_int'] + accounts_train['username_char']
)

# length of the digit run that ends the username, 0 if it does not end in a digit
trailing_digits = [re.search(r'\d+$', handle) for handle in handles]
accounts_train['username_int_end'] = [
    len(hit.group()) if hit is not None else 0 for hit in trailing_digits
]

# digit count in the display name; stays NaN where the display name is missing
accounts_train['name_int'] = [
    np.nan if pd.isnull(display_name) else sum(ch.isdigit() for ch in display_name)
    for display_name in display_names
]

# Open question: do bots dress up account details (emoji flags, hashtags, ...) to look real?
accounts_train.head()

Unnamed: 0,id,name,username,location,url,description,followers,friends,favourites_count,statuses_count,default_profile,default_profile_image,class,date_created,days_active,followers_age,following_age,favourites_age,tweets_age,followers_ratio,username_char_len,name_ratio,username_int,username_char,username_other,username_int_end,name_int
0,418,Dennis Crowley,dens,NYC / Kingston,https://t.co/63fYABYs9J,"I like to build things (@Foursquare📱, @Stockad...",85422,2623,14990,69341,0.0,0.0,human,2006-07-05,5029,16.985882,0.521575,2.980712,13.788228,0.030706,4,3.5,0,4,0,0,0.0
1,586,Chris Sacca,sacca,The Rocky Mountains,,I love @crystale & our 3 girls. Used to invest...,1669241,1004,301974,71148,0.0,0.0,human,2006-07-13,5021,332.451902,0.19996,60.142203,14.170086,0.000601,5,2.2,0,5,0,0,0.0
2,8557,Japhy Grant,japhygrant,"Los Angeles, CA",https://t.co/jTIWv0ILjC,"👨‍🚀🌵Founder, #WonderValleyProjects. Had a hand...",3207,122,8901,15191,0.0,0.0,human,2006-10-11,4931,0.650375,0.024741,1.805111,3.080714,0.038042,10,1.1,0,10,0,0,0.0
3,12522,C.C. Chapman,cc_chapman,"Boston, MA",,Storyteller trying to leave it better than I f...,47372,22593,12468,139668,0.0,0.0,human,2006-11-15,4896,9.675654,4.614583,2.546569,28.526961,0.476927,10,1.2,0,9,1,0,0.0
4,15913,Larry Hryb,majornelson,In your Xbox,https://t.co/UNpCOu2HKp,"""The Xbox Guy"" 🎮 • 💑@thehappygirl • Twitter & ...",1127687,5,7,54804,0.0,0.0,human,2006-11-22,4889,230.658008,0.001023,0.001432,11.209654,4e-06,11,0.909091,0,11,0,0,0.0


In [0]:
# info() prints its report itself and returns None; calling it bare avoids the
# stray "None" line that print(df.info()) adds after each report.
accounts_train.info()
accounts_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19675 entries, 0 to 19674
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     19675 non-null  int64         
 1   name                   19673 non-null  object        
 2   username               19675 non-null  object        
 3   location               11553 non-null  object        
 4   url                    5646 non-null   object        
 5   description            12898 non-null  object        
 6   followers              19675 non-null  int64         
 7   friends                19675 non-null  int64         
 8   favourites_count       19675 non-null  int64         
 9   statuses_count         19675 non-null  int64         
 10  default_profile        12097 non-null  float64       
 11  default_profile_image  8185 non-null   float64       
 12  class                  19675 non-null  object        
 13  d

**Let's load in the additional features created using the recent tweets of each user**

Don't load in the actual tweets - (5000 + 18000) accounts * 200 tweets each ≈ 4.6 million rows, which is a bit too much

In [0]:
# load in CSVs of tweet-related features aggregated by username, which we can join without loading a lot of tweets

In [0]:
# # tweet_frequency for both datasets
# frequency_5000 = pd.read_csv(path_to_file + '5000_tweets_frequency.csv')
# #frequency_users = pd.read_csv(path_to_file + "training_tweets_frequency.csv")