In [20]:
# usual stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# searching algorithm
from sklearn.neighbors import NearestNeighbors

# performance
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE

# NLP
import preprocessor as p
from nltk.tokenize import word_tokenize

### 1) load in the data and the model results

In [21]:
# user data with all features
df = pd.read_csv("Final datasets/model_data/users_5G_with_features.csv")
# results of the bot detection model
df2 = pd.read_csv("Final datasets/bot_tagged_data/bot_predict_5G.csv")

In [22]:
# combine for what we need
df = df.iloc[:,1:]
tagged = df2.iloc[:,[2,3]]
df = df.merge(tagged,how='left',left_on='username',right_on='username')
df.verified = df.verified*1
df.shape

(11340, 90)

In [23]:
# some usernames appear twice - we'll exclude
df = df.drop_duplicates(subset=['username'])
# for final output
df_final = df
df.shape

(11334, 90)

In [24]:
# remove bots and store their usernames
bots = df[df['predicted_class'] == 'bot'].copy()
bots = set(bots.username)
len(bots)

24

In [25]:
# drop bots from the working df
df = df[df['predicted_class']!='bot'].copy()
df = df.iloc[:,:-1]

# for later as well
df_orig = df
df.shape

(11310, 89)

### 2) Filter out most credible business/org/ngo accounts

Many accounts will be very clearly business/organisations/ngos and so we can remove them from the working df. The average twitter account has 707 followers, our average is ~9000 skewed by large accounts (Ny times etc) and so we will identify accounts with above average followings as *not* the accounts of real normal people.

Large followings may also influence the content posted and so for authentic thoughts we aren't looking at accounts with large audiences.

In [26]:
# drop accounts with higher than average followers - 9000 or so and skewed by big accounts
average_followers = df.followers.mean()
print("Average:", average_followers)
df = df[df['followers'] <= average_followers].copy()

print("Updated average:",df.followers.mean())
print(df.shape)

Average: 6655.702298850575
Updated average: 776.084377372817
(10536, 89)


EDA has revealed some common terms used in the bios of accounts related to the climate emergency domain. We also want to target collective pronouns so 'we' is included as an indicator of a personal twitter account

In [27]:
# remove accounts with domain specific pronouns/terms in bios
df['description'] = df['description'].str.casefold()
bios = []
terms = ['we','ngo','company','grassroots']

for item, frame in df['description'].iteritems():
    if pd.notnull(frame):
        clean = p.clean(frame)
        # token seperates tweets into lists of words
        token = word_tokenize(clean)
        if terms[0] in token or terms[1] in token or terms[2] in token or terms[3] in token:
            bios.append(1)
        else:
            bios.append(0)
    else:
        bios.append(0)
df['bio_pronouns'] = bios

In [28]:
# drop the bios
df = df[df['bio_pronouns'] != 1]
df = df.iloc[:,:-1]
df.shape

(10212, 89)

In [29]:
# see how many we've lost - saved them as a list for later
confirmed_org = set(df_orig.username) - set(df.username)
print(len(confirmed_org),"confirmed business, organisations or known personalities")

1098 confirmed business, organisations or known personalities


In [30]:
# change column name for ratio
df.columns = ['popularity' if x=='followers_friends_ratio' else x for x in df.columns]

### 3) Base methodology to identify business/organisations and personalities within our dataset

This is an initial attempt at filtering out business/organisations and internet personalities from our data once bots have been removed. The aim is to use the characteristics of verified accounts in our dataset to identifiy those with similar features. Now that we have removed a large amount of obvious and popular businesses we can use the verified accounts that remain to identify other accounts (non-verified) who share similar features.

 - Run a logistic regression based on identifying verified users within our dataset (99% acc)
 - Using feature elimination to identify the significant features for verified users (currently at 12).
 - Identify clusters of accounts similar to those verified, currently using euclidean distance of each row as a 12x1 vector and set as 50 nearest for every verified account.
 - Collect all accounts who are within the 50 nearest neighbours for our verified accounts.
 - Assume those without a profile URL are real users - why would a business make an account without a URL?
 - Combine with our filtered businesses to create the 'org' tag
 
We may be able to adopt the EM-algorithm for this job - the method seems to be picking out businesses but theres still some actual users.


In [31]:
# lets have a look at those features who differ the most for verified/non-verified
ratios = []

group = df.groupby('verified').mean()
for column in group:
    ratio = group[column][1]/group[column][0]
    ratios.append(ratio)
means = pd.DataFrame({'feature':group.columns.values,'dif':ratios})

In [32]:
# comparing the averages 
df.groupby('verified').count()

Unnamed: 0_level_0,id,name,username,location,url,description,followers,friends,favourites_count,statuses_count,...,hash_PT,username_urltitle_simimlarity,username_in_urltitle,username_name_similarity,username_in_bio,lower_userid,popularity,bio_sentiment_negative,bio_sentiment_neutral,bio_sentiment_positive
verified,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,10175,10173,10175,6972,3056,8541,10175,10175,10175,10175,...,10154,1933,1933,6019,6019,6019,5824,10175,10175,10175
1,37,37,37,34,32,37,37,37,37,37,...,37,28,28,34,34,34,34,37,37,37


In [33]:
# ranking
means = means.sort_values('dif',ascending=False)
means.head(10)

Unnamed: 0,feature,dif
1,followers,5.464861
42,OGTratio,2.90263
25,favourite_count,2.836583
79,bio_sentiment_negative,2.115385
78,popularity,2.011863
2,friends,1.999211
8,followers_age,1.93863
4,statuses_count,1.894163
7,days_active,1.700241
80,bio_sentiment_neutral,1.531086


We can be pretty certain that those verified are not real people (personalities, organisations etc) - we can also see how some features appear to differ drastically for verified users, even the features that are based on followings are expected

In [34]:
# start prepping for regression
log_df = df.select_dtypes(exclude=['object'])
log_df = log_df[log_df.columns.drop(list(log_df.filter(regex='followers')))] # followers_frien
log_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10212 entries, 0 to 11339
Data columns (total 81 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              10212 non-null  float64
 1   verified                        10212 non-null  int64  
 2   friends                         10212 non-null  int64  
 3   favourites_count                10212 non-null  int64  
 4   statuses_count                  10212 non-null  int64  
 5   default_profile                 10212 non-null  int64  
 6   default_profile_image           10212 non-null  int64  
 7   days_active                     10212 non-null  int64  
 8   following_age                   10212 non-null  float64
 9   favourites_age                  10212 non-null  float64
 10  tweets_age                      10212 non-null  float64
 11  username_char_len               10212 non-null  int64  
 12  name_ratio                      

In [35]:
# incomplete so we dont need these
log_df = log_df.drop(columns=['username_in_urltitle','username_urltitle_simimlarity'])
log_df = log_df.drop([log_df.columns[72],log_df.columns[73],log_df.columns[74],log_df.columns[75],log_df.columns[76],log_df.columns[77]],axis='columns')
log_df = log_df.dropna()
log_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10180 entries, 0 to 11339
Data columns (total 73 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              10180 non-null  float64
 1   verified                        10180 non-null  int64  
 2   friends                         10180 non-null  int64  
 3   favourites_count                10180 non-null  int64  
 4   statuses_count                  10180 non-null  int64  
 5   default_profile                 10180 non-null  int64  
 6   default_profile_image           10180 non-null  int64  
 7   days_active                     10180 non-null  int64  
 8   following_age                   10180 non-null  float64
 9   favourites_age                  10180 non-null  float64
 10  tweets_age                      10180 non-null  float64
 11  username_char_len               10180 non-null  int64  
 12  name_ratio                      

In [36]:
# model data
ids = log_df.id.copy()
sum(ids.notnull())

10180

### 4) Normalise the data for logisitic regression (y = verified)

In [37]:
# many features so we'll normalise to assist convergence
log_df = log_df.iloc[:,1:]
x = log_df.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
scaled_X= pd.DataFrame(x_scaled,columns=log_df.columns)
scaled_X

Unnamed: 0,verified,friends,favourites_count,statuses_count,default_profile,default_profile_image,days_active,following_age,favourites_age,tweets_age,...,pStopW_OT,pnouns_OT,padj_OT,pverbs_OT,padv_OT,ppron_OT,Gun_Index,tot_hashtags,hash_PT,bio_sentiment_positive
0,0.0,0.020913,0.001523,0.000725,1.0,0.0,0.011401,0.023114,0.021961,0.015647,...,0.615776,0.245029,0.096613,0.407318,0.216510,0.270300,0.153624,0.017363,0.017450,0.0
1,0.0,0.000400,0.000778,0.000214,1.0,0.0,0.647394,0.000009,0.000229,0.000094,...,0.667429,0.267583,0.109357,0.426901,0.208967,0.216864,0.202547,0.015385,0.015385,0.0
2,0.0,0.037723,0.019455,0.001589,1.0,0.0,0.145358,0.003748,0.025214,0.003080,...,0.727797,0.267506,0.109827,0.512139,0.295954,0.279410,0.151716,0.020659,0.020659,0.0
3,0.0,0.026016,0.010790,0.001578,0.0,0.0,0.571661,0.000663,0.003589,0.000785,...,0.659917,0.295754,0.109989,0.404167,0.324479,0.137338,0.222640,0.000220,0.000220,1.0
4,0.0,0.184811,0.094952,0.048698,0.0,0.0,0.751425,0.003588,0.024047,0.018435,...,0.464603,0.192439,0.073210,0.353316,0.129443,0.207008,0.191665,0.001978,0.001988,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10175,0.0,0.055433,0.005258,0.002633,1.0,0.0,0.154112,0.005199,0.006432,0.004816,...,0.628288,0.223986,0.072854,0.596806,0.306055,0.406923,0.125263,0.001978,0.001978,0.0
10176,0.0,0.051531,0.000966,0.000507,0.0,0.0,0.481270,0.001560,0.000382,0.000299,...,0.774202,0.279799,0.114917,0.465748,0.249696,0.237818,0.131125,0.000659,0.000659,0.0
10177,0.0,0.003202,0.005174,0.002321,1.0,0.0,0.161645,0.000286,0.006038,0.004050,...,0.750188,0.258407,0.119785,0.445158,0.227298,0.190173,0.228315,0.000879,0.000879,0.0
10178,0.0,0.017911,0.022056,0.001415,1.0,0.0,0.068607,0.003719,0.059732,0.005732,...,0.745565,0.279788,0.107020,0.494468,0.337276,0.286141,0.146541,0.000000,0.000000,0.0


In [38]:
# train test
X = scaled_X.iloc[:,1:]
y = scaled_X.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [39]:
# fit the model (upped the iterations because f is high)
logreg= LogisticRegression(max_iter=1000)
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
# coefficient ranking would be a place to start
coefs = []

for i in list(range(0,len(X.columns)-1)):
    print(X.columns[i],"-",logreg.coef_[0,i])
    coefs.append(logreg.coef_[0,i])
    
print("") 

# CV accuracy (10)
y_pred = cross_val_predict(logreg,X,y,cv=10)
print(logreg.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y, y_pred))    

friends - 1.0571103094944976
favourites_count - -0.04398674140117255
statuses_count - 0.15106954523812036
default_profile - -0.6051120137305005
default_profile_image - -0.4169524369087506
days_active - 1.9476592425283885
following_age - -0.03924361772137932
favourites_age - -0.16637931305708994
tweets_age - 0.002566416929938712
username_char_len - 0.1785509562926145
name_ratio - -0.14287330054634154
username_int - -0.7213108546687254
username_char - 0.7670196331893508
username_other - -0.13899689985915267
username_int_end - -0.6462438040693672
name_int - -0.22720820621048057
non_unique_tweets - -0.17782870607487813
average_tweets_day - -0.5915371297904635
average_minutes_between_tweets - -0.014448475718815275
average_hours_tweeted - 0.8523736799841825
tweetid - 0.20778510266127848
retweet_count - -0.14509418018405762
favourite_count - 0.1338698004214858
swears - -0.31177423507046315
polite - 0.257032871192424
fourchan - 0.013586933934602507
log_swears - -0.8345105459568997
log_polite -

In [41]:
results = pd.DataFrame({'feature':X.columns[:-1],'coef':coefs})
results.sort_values('coef',ascending=False)

Unnamed: 0,feature,coef
5,days_active,1.947659
61,pStopW_OT,1.242576
0,friends,1.057110
19,average_hours_tweeted,0.852374
12,username_char,0.767020
...,...,...
17,average_tweets_day,-0.591537
3,default_profile,-0.605112
14,username_int_end,-0.646244
11,username_int,-0.721311


### 5) Recursive feature elimination to find our optimal coefs

In [42]:
# setup RFE

estimator = logreg
selector = RFE(estimator, 12, step=1)
selector = selector.fit(X, y)
rfe_list = list(selector.support_)

In [43]:
# run with our optimal features
X = X.loc[:,rfe_list]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [44]:
logreg= LogisticRegression()
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
coefs = []

for i in list(range(0,len(X.columns))):
    print(X.columns[i],"-",logreg.coef_[0,i])
    coefs.append(logreg.coef_[0,i])
    
print("") 
y_pred = cross_val_predict(logreg,X,y,cv=10)
print(logreg.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y, y_pred))    

friends - 1.1447573292214348
days_active - 2.364292034102041
username_int - -0.9273591319382776
username_char - 0.9699208620052506
log_swears - -0.829537915075584
log_polite - 0.7266491663336396
office_hours_aus - 0.7552141588288608
w/urlR_OT - 0.8163098625894485
StopW_OT - 1.0794962729361273
nouns_OT - 0.9492231918152106
adj_OT - 0.9088317060572126
pStopW_OT - 1.3261330080934364

LogisticRegression accuracy is 0.996


### 6) Cluster the data by distance to verified users as represented as a vector (12x1) for each account

The accuracy of our model doesn't matter too much. Considering the small amount of positives (verified) it won't be that difficult for a regression model to identify negatives. But we can use these key features for our searching algorithm

In [46]:
# just add in the truths and ids for joining
X['verified'] = y
X['ids'] = list(ids)
X

Unnamed: 0,friends,days_active,username_int,username_char,log_swears,log_polite,office_hours_aus,w/urlR_OT,StopW_OT,nouns_OT,adj_OT,pStopW_OT,verified,ids
0,0.020913,0.011401,0.272727,0.400000,0.499040,0.339121,0.392,0.358696,0.292500,0.179369,0.166323,0.615776,0.0,1.230000e+18
1,0.000400,0.647394,0.272727,0.200000,0.392472,0.511115,0.310,0.257576,0.228514,0.141187,0.135696,0.667429,0.0,3.489127e+08
2,0.037723,0.145358,0.181818,0.400000,0.498093,0.303764,0.570,0.403061,0.254670,0.144254,0.139280,0.727797,0.0,9.930000e+17
3,0.026016,0.571661,0.181818,0.400000,0.303764,0.434588,0.405,0.452261,0.551018,0.380570,0.332842,0.659917,0.0,7.515698e+08
4,0.184811,0.751425,0.181818,0.266667,0.435534,0.304710,0.377,0.129032,0.186663,0.119150,0.106600,0.464603,0.0,1.222358e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10175,0.055433,0.154112,0.000000,0.400000,1.000000,0.303764,0.455,0.187500,0.141805,0.077908,0.059594,0.628288,0.0,9.780000e+17
10176,0.051531,0.481270,0.000000,0.866667,0.434588,0.367269,0.415,0.361809,0.253662,0.141278,0.136458,0.774202,0.0,2.161878e+09
10177,0.003202,0.161645,0.090909,0.400000,0.338175,0.434588,0.165,0.000000,0.533113,0.282997,0.308507,0.750188,0.0,9.640000e+17
10178,0.017911,0.068607,0.000000,0.533333,0.763082,0.414702,0.655,0.215054,0.277668,0.160582,0.144451,0.745565,0.0,1.130000e+18


In [47]:
# we will use nearest neighbours to find those accounts with similar features to our verified 
X_verified = X.loc[X['verified']==1]
X_not_verified = X.loc[X['verified']==0]
X_verified = X_verified.iloc[:,:-2]
print(len(X_verified))
X_not_verified = X_not_verified.iloc[:,:-2]

# brute force just means it will literally take the distance between all pairs which is fine for our sample size
nbrs = NearestNeighbors(n_neighbors=50, algorithm='brute').fit(X_not_verified)
distances, indices = nbrs.kneighbors(X_verified)

37


In [48]:
# see what accounts are suspect
test = np.concatenate(indices,axis=0)
link = X.iloc[test,-1]
print(len(set(link)),"suspected business/orgs!")

test_df = df[df['id'].isin(link)]

1037 suspected business/orgs!


In [49]:
# account info for visual inspection 
pivot = pd.DataFrame({'ids':link,'count':1})
grouper = pivot.groupby('ids').sum()
grouper.sort_values('count')
test_df = test_df.merge(grouper,left_on='id',right_on=grouper.index)
test_df.shape

(4487, 90)

In [50]:
# add variable to df
suspected = []
for i in df.id:
    if i in set(link):
        suspected.append(1)
    else:
        suspected.append(0)
df['suspected_business'] = suspected

So we have x amount of suspected accounts - we are now making the assumption that those accounts who **don't** have a bio URL are not a business/organisation or NGO. This is more based on the idea that why would corporate social media accounts not contain a link to a site or other social media? Even @Twitter has a link...

In [51]:
# so lets drop those without and store as suspected_org
df_dropped = df.loc[(df.url.notnull() == True) & (df.suspected_business == 1)]
df_dropped.shape

(1041, 90)

In [52]:
suspected_org = set(df_dropped.username)

In [53]:
df_verified = df[df['verified']==1]
df_verified.shape

(37, 90)

In [54]:
verified_org = set(df_verified.username)

In [55]:
# filter out whats left (our humans!)
df_new = df.drop(df[(df['url'].notnull()==True) & (df['suspected_business'] == 1)].index)
df_new = df_new[df_new['verified']==0].copy()

In [56]:
sum(df.verified.isnull())

0

In [57]:
# just to check everything has gone through
humans = set(df_new.username)

In [58]:
print(len(confirmed_org))
print(len(suspected_org))
print(len(verified_org))
print(len(humans))
print(len(bots))

1098
1041
37
9139
24


In [59]:
hits = list(confirmed_org) + list(suspected_org) + list(humans) + list(verified_org)
print(len(hits))
print(len(set(df_orig.username)))

# five unnaccounted for!?

11315
11310


In [60]:
orgs = pd.DataFrame({'username':list(confirmed_org),'tag':'org'})
orgs_sus = pd.DataFrame({'username':list(suspected_org),'tag':'org'})
orgs_verified = pd.DataFrame({'username':list(verified_org),'tag':'org'})
humans_ = pd.DataFrame({'username':list(humans),'tag':'human'})
bots = pd.DataFrame({'username':list(bots),'tag':'bot'})

final_df = pd.concat([orgs,orgs_sus,orgs_verified,humans_,bots],ignore_index=True)

In [61]:
final_df.groupby('tag').count()

Unnamed: 0_level_0,username
tag,Unnamed: 1_level_1
bot,24
human,9139
org,2176


In [62]:
output = df_final.merge(final_df,left_on='username',right_on='username')
output.head()

Unnamed: 0,id,name,username,location,url,description,verified,followers,friends,favourites_count,...,username_in_urltitle,username_name_similarity,username_in_bio,lower_userid,followers_friends_ratio,bio_sentiment_negative,bio_sentiment_neutral,bio_sentiment_positive,predicted_class,tag
0,1.23e+18,jaleel appleseed 007,007Jaleel,,https://t.co/4zFqDPbEKL,Ginger Golem Goulish Goyish Soulful Murdering ...,0,82,209,711,...,,,,,,0,0,0,human,org
1,348912700.0,BAK,009BAK,"Chiang Mai, Thailand",,I am here to take what you say and feeded back...,0,116,4,363,...,,0.666667,0.0,1.0,29.0,0,1,0,human,human
2,9.93e+17,00TurboK aka Juan Wick,00TurboK,,,"Pokemon GO player 🔥Valor🔥 First of his name, B...",0,138,377,9080,...,,,,,,0,0,0,human,human
3,751569800.0,ravi patel,00akshar,,,as allways-perfect in this world n in every......,0,28,260,5036,...,,0.222222,0.0,1.0,0.107692,0,0,1,human,human
4,4697876000.0,F’n Idiots Everywhere,0NoMyProfile,Everywhere,,World is doomed if we have to rely on the rest...,0,45,458,1111,...,,0.30303,0.0,0.0,0.098253,1,0,0,human,org


In [63]:
#output.to_csv("Final datasets/org_tagged_data/5G_tagged.csv")