In [263]:
# usual stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# searching algorithm
from sklearn.neighbors import NearestNeighbors

# performance
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE

# NLP
import preprocessor as p
from nltk.tokenize import word_tokenize

### 1) load in the data and the model results

In [264]:
# user data with all features
df = pd.read_csv("Final datasets/model_data/users_climate_with_features.csv")
# this has the 'verified' column
df2 = pd.read_csv("Final datasets/users_data/users_climate.csv")
# results of the bot detection model
df3 = pd.read_csv("Final datasets/tagged_data/bot_predict_test.csv")

# combine for what we need
df = df.iloc[:,1:]
verified = df2.iloc[:,[0,6]]
tagged = df3.iloc[:,[2,3]]
df = df.merge(verified,left_on='id',right_on='id')
df = df.merge(tagged,left_on='username',right_on='username')
df.verified = df.verified*1
df.shape

(6834, 87)

In [265]:
# some usernames appear twice - we'll exclude
df = df.drop_duplicates(subset=['username'])
df_final = df
df.shape

(6827, 87)

In [266]:
# remove bots and store their usernames
bots = df[df['predicted_class'] == 'bot'].copy()
bots = set(bots.username)
len(bots)

63

In [267]:
# drop bots from the working df
df = df[df['predicted_class']!='bot'].copy()
df = df.iloc[:,:-1]
df_orig = df
df.shape

(6764, 86)

### 2) Filter out most credible business/org/ngo accounts

Many accounts will be very clearly business/organisations/ngos and so we can remove them from the working df. The average twitter account has 707 followers, our average is ~9000 skewed by large accounts (Ny times etc) and so we will identify accounts with above average followings as *not* the accounts of real normal people.

Large followings may also influence the content posted and so for authentic thoughts we aren't looking at accounts with large audiences.

In [268]:
# drop accounts with higher than average followers - 9000 or so and skewed by big accounts
average_followers = df.followers.mean()

df = df[df['followers'] <= average_followers].copy()

print("Updated average:",df.followers.mean())
print(df.shape)

Updated average: 1100.9826488379497
(6282, 86)


EDA has revealed some common terms used in the bios of accounts related to the climate emergency domain. We also want to target collective pronouns so 'we' is included as an indicator of a personal twitter account

In [269]:
# remove accounts with domain specific pronouns/terms in bios
df['description'] = df['description'].str.casefold()
bios = []
terms = ['we','ngo','company','grassroots']

for item, frame in df['description'].iteritems():
    if pd.notnull(frame):
        clean = p.clean(frame)
        # token seperates tweets into lists of words
        token = word_tokenize(clean)
        if terms[0] in token or terms[1] in token or terms[2] in token or terms[3] in token:
            bios.append(1)
        else:
            bios.append(0)
    else:
        bios.append(0)
df['bio_pronouns'] = bios

In [270]:
# drop the bios
df = df[df['bio_pronouns'] != 1]
df = df.iloc[:,:-1]
df.shape

(5910, 86)

In [271]:
# see how many we've lost - saved them as a list for later
confirmed_org = set(df_orig.username) - set(df.username)
print(len(confirmed_org),"confirmed business, organisations or known personalities")

854 confirmed business, organisations or known personalities


In [272]:
# change column name for ratio
df.columns = ['popularity' if x=='followers_friends_ratio' else x for x in df.columns]

### 3) Base methodology to identify business/organisations and personalities within our dataset

This is an initial attempt at filtering out business/organisations and internet personalities from our data once bots have been removed. The aim is to use the characteristics of verified accounts in our dataset to identifiy those with similar features. Now that we have removed a large amount of obvious and popular businesses we can use the verified accounts that remain to identify other accounts (non-verified) who share similar features.

 - Run a logistic regression based on identifying verified users within our dataset (99% acc)
 - Using feature elimination to identify the significant features for verified users (currently at 12).
 - Identify clusters of accounts similar to those verified, currently using euclidean distance of each row as a 12x1 vector and set as 50 nearest for every verified account.
 - Collect all accounts who are within the 50 nearest neighbours for our verified accounts.
 - Assume those without a profile URL are real users - why would a business make an account without a URL?
 - Combine with our filtered businesses to create the 'org' tag
 
We may be able to adopt the EM-algorithm for this job - the method seems to be picking out businesses but theres still some actual users.


In [273]:
# lets have a look at those features who differ the most for verified/non-verified
ratios = []

group = df.groupby('verified').mean()
for column in group:
    ratio = group[column][1]/group[column][0]
    ratios.append(ratio)
means = pd.DataFrame({'feature':group.columns.values,'dif':ratios})

In [274]:
# comparing the averages 
df.groupby('verified').mean()

Unnamed: 0_level_0,id,followers,friends,favourites_count,statuses_count,default_profile,default_profile_image,days_active,followers_age,following_age,...,hash_PT,username_urltitle_simimlarity,username_in_urltitle,username_name_similarity,username_in_bio,lower_userid,popularity,bio_sentiment_negative,bio_sentiment_neutral,bio_sentiment_positive
verified,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.537463e+17,1056.113288,1147.205622,13480.873083,16336.854514,0.560307,0.046167,2234.046848,0.828746,1.185531,...,0.726865,0.306109,0.178138,0.584687,0.013799,0.437138,1.907319,0.139012,0.482283,0.378705
1,1.336558e+17,3937.95,1469.9,6364.775,13679.525,0.325,0.0,2992.675,1.601172,0.654547,...,0.856215,0.254221,0.033333,0.67419,0.025,0.75,5.106493,0.075,0.4,0.525


In [275]:
# ranking
means = means.sort_values('dif',ascending=False)
means.head(10)

Unnamed: 0,feature,dif
1,followers,3.728719
39,OGTratio,3.223863
75,popularity,2.677314
38,favourite_countOT,2.602077
8,followers_age,1.932042
73,username_in_bio,1.811728
74,lower_userid,1.715705
19,non_unique_tweets,1.683086
18,name_int,1.497449
50,nHW_OT,1.401049


We can be pretty certain that those verified are not real people (personalities, organisations etc) - we can also see how some features appear to differ drastically for verified users, even the features that are based on followings are expected

In [276]:
# start prepping for regression
log_df = df.select_dtypes(exclude=['object'])
log_df = log_df[log_df.columns.drop(list(log_df.filter(regex='followers')))] # followers_frien
log_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5910 entries, 0 to 6833
Data columns (total 78 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              5910 non-null   int64  
 1   friends                         5910 non-null   int64  
 2   favourites_count                5910 non-null   int64  
 3   statuses_count                  5910 non-null   int64  
 4   default_profile                 5910 non-null   int64  
 5   default_profile_image           5910 non-null   int64  
 6   days_active                     5910 non-null   int64  
 7   following_age                   5910 non-null   float64
 8   favourites_age                  5910 non-null   float64
 9   tweets_age                      5910 non-null   float64
 10  username_char_len               5910 non-null   int64  
 11  name_ratio                      5910 non-null   float64
 12  username_int                    59

In [277]:
# incomplete so we dont need these
log_df = log_df.drop(columns=['username_in_urltitle','username_urltitle_simimlarity'])
log_df = log_df.dropna()
log_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5894 entries, 0 to 6833
Data columns (total 76 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              5894 non-null   int64  
 1   friends                         5894 non-null   int64  
 2   favourites_count                5894 non-null   int64  
 3   statuses_count                  5894 non-null   int64  
 4   default_profile                 5894 non-null   int64  
 5   default_profile_image           5894 non-null   int64  
 6   days_active                     5894 non-null   int64  
 7   following_age                   5894 non-null   float64
 8   favourites_age                  5894 non-null   float64
 9   tweets_age                      5894 non-null   float64
 10  username_char_len               5894 non-null   int64  
 11  name_ratio                      5894 non-null   float64
 12  username_int                    58

In [278]:
# model data
ids = log_df.id.copy()
sum(ids.notnull())

5894

### 4) Normalise the data for logisitic regression (y = verified)

In [279]:
# many features so we'll normalise to assist convergence
log_df = log_df.iloc[:,1:]
x = log_df.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
scaled_X= pd.DataFrame(x_scaled,columns=log_df.columns)
scaled_X

Unnamed: 0,friends,favourites_count,statuses_count,default_profile,default_profile_image,days_active,following_age,favourites_age,tweets_age,username_char_len,...,tot_hashtags,hash_PT,username_name_similarity,username_in_bio,lower_userid,popularity,bio_sentiment_negative,bio_sentiment_neutral,bio_sentiment_positive,verified
0,0.008875,0.000033,0.000022,1.0,1.0,0.185296,0.000832,0.000029,0.000048,0.181818,...,0.006276,0.039226,0.571429,0.0,0.0,0.000016,0.0,1.0,0.0,0.0
1,0.031500,0.047165,0.018803,1.0,0.0,0.422793,0.001294,0.018171,0.017349,0.454545,...,0.009995,0.009995,0.193548,0.0,0.0,0.001127,1.0,0.0,0.0,0.0
2,0.240589,0.031887,0.008486,0.0,0.0,0.665073,0.006290,0.007824,0.004987,0.090909,...,0.016272,0.016272,0.260870,0.0,1.0,0.001646,0.0,1.0,0.0,1.0
3,0.002536,0.000031,0.003205,1.0,1.0,0.739789,0.000059,0.000007,0.001693,0.272727,...,0.002092,0.002092,0.600000,0.0,1.0,0.000376,0.0,1.0,0.0,0.0
4,0.018529,0.002104,0.000540,1.0,0.0,0.010361,0.025885,0.027443,0.016922,0.454545,...,0.007903,0.007903,1.000000,0.0,0.0,0.000387,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5889,0.009655,0.000079,0.000049,1.0,0.0,0.015939,0.009381,0.000710,0.001088,0.272727,...,0.031381,0.092296,0.461538,0.0,0.0,0.001029,0.0,1.0,0.0,0.0
5890,0.367466,0.016462,0.002249,0.0,0.0,0.726041,0.008803,0.003701,0.001211,0.363636,...,0.024407,0.024407,0.416667,0.0,1.0,0.000895,0.0,0.0,1.0,0.0
5891,0.020577,0.002126,0.000824,1.0,0.0,0.344491,0.001038,0.001004,0.000932,0.818182,...,0.028824,0.028824,0.315789,0.0,0.0,0.000273,0.0,1.0,0.0,0.0
5892,0.014628,0.000572,0.000374,1.0,0.0,0.001992,0.061396,0.022399,0.035142,0.454545,...,0.023013,0.023013,0.615385,0.0,0.0,0.000345,0.0,1.0,0.0,0.0


In [280]:
# train test
X = scaled_X.iloc[:,:-1]
y = scaled_X.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [281]:
# fit the model (upped the iterations because f is high)
logreg= LogisticRegression(max_iter=1000)
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [282]:
# coefficient ranking would be a place to start
coefs = []

for i in list(range(0,len(X.columns)-1)):
    print(X.columns[i],"-",logreg.coef_[0,i])
    coefs.append(logreg.coef_[0,i])
    
print("") 

# CV accuracy (10)
y_pred = cross_val_predict(logreg,X,y,cv=10)
print(logreg.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y, y_pred))    

friends - -0.5506141183402873
favourites_count - -0.30463504887961074
statuses_count - -0.06057592858712894
default_profile - -0.4169222012955626
default_profile_image - -0.4633106364656112
days_active - 0.33563465441631923
following_age - -0.0497815248044303
favourites_age - -0.11201050709164914
tweets_age - -0.09029873899759848
username_char_len - -0.17512160093649498
name_ratio - -0.09824447980575357
username_int - -0.3924215819911859
username_char - 0.11732082268669378
username_other - 0.25583931960177625
username_int_end - -0.6263332489844541
name_int - 0.31159951679789805
non_unique_tweets - 0.04396251224839777
average_tweets_day - -0.7545889678111571
average_minutes_between_tweets - -0.05355224051701477
average_hours_tweeted - 0.41327035910978077
swears - -0.7571402713972397
polite - -0.7360755827442506
fourchan - 0.08094442500526657
log_swears - 0.5833861728089005
log_polite - -0.9350222381054313
log_fourchan - 0.1558153435292719
office_hours_utc - -1.0771432612245504
office_ho

In [283]:
results = pd.DataFrame({'feature':X.columns[:-1],'coef':coefs})
results.sort_values('coef',ascending=False)

Unnamed: 0,feature,coef
69,lower_userid,0.939517
39,w/urlR_OT,0.926328
50,adj_OT,0.925195
36,OGTratio,0.880841
31,tot_tweets,0.673037
...,...,...
21,polite,-0.736076
17,average_tweets_day,-0.754589
20,swears,-0.757140
24,log_polite,-0.935022


### 5) Recursive feature elimination to find our optimal coefs

In [284]:
# setup RFE

estimator = logreg
selector = RFE(estimator, 12, step=1)
selector = selector.fit(X, y)
rfe_list = list(selector.support_)

In [285]:
# run with our optimal features
X = X.loc[:,rfe_list]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [286]:
logreg= LogisticRegression()
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [287]:
coefs = []

for i in list(range(0,len(X.columns))):
    print(X.columns[i],"-",logreg.coef_[0,i])
    coefs.append(logreg.coef_[0,i])
    
print("") 
y_pred = cross_val_predict(logreg,X,y,cv=10)
print(logreg.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y, y_pred))    

username_int_end - -0.8114740245808449
swears - -0.8978868490157045
polite - -0.7084664511632274
log_polite - -0.9112397272429051
tot_tweets - 0.7786853940900776
OGTratio - 0.9052166118483247
w/urlR_OT - 1.2997227020533195
TSW_OT - 1.0704885233052
StopW_OT - 0.9323407486951355
nouns_OT - 1.0506194035451293
adj_OT - 1.3154286901750127
lower_userid - 1.1517727770098574

LogisticRegression accuracy is 0.993


### 6) Cluster the data by distance to verified users as represented as a vector (12x1) for each account

The accuracy of our model doesn't matter too much. Considering the small amount of positives (verified) it won't be that difficult for a regression model to identify negatives. But we can use these key features for our searching algorithm

In [288]:
# just add in the truths and ids for joining
X['verified'] = y
X['ids'] = list(ids)
X

Unnamed: 0,username_int_end,swears,polite,log_polite,tot_tweets,OGTratio,w/urlR_OT,TSW_OT,StopW_OT,nouns_OT,adj_OT,lower_userid,verified,ids
0,0.0,0.000000,0.087413,0.476704,0.151515,0.001188,0.687500,0.387249,0.256057,0.297321,0.171089,0.0,0.0,914204469927260162
1,0.0,0.042254,0.090909,0.484106,1.000000,0.001267,1.000000,0.367169,0.352423,0.304762,0.234637,0.0,0.0,2588852462
2,0.0,0.000000,0.013986,0.130824,1.000000,0.086092,0.500000,0.284302,0.071346,0.282298,0.131771,1.0,1.0,257857728
3,0.0,0.000000,0.027972,0.261648,1.000000,0.000274,0.989529,0.269699,0.151763,0.214959,0.158677,1.0,0.0,114771707
4,0.0,0.098592,0.048951,0.367269,1.000000,0.004223,0.296296,0.233389,0.228830,0.196032,0.208204,0.0,0.0,1232099440346812417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5889,0.0,0.000000,0.226244,0.656190,0.333333,0.016560,0.970588,0.593926,0.459316,0.506723,0.437069,0.0,0.0,1222260400277688321
5890,0.0,0.014085,0.090909,0.484106,1.000000,0.025424,0.596026,0.387068,0.278612,0.366131,0.220134,1.0,0.0,137410730
5891,0.0,0.197183,0.139860,0.565412,1.000000,0.004220,0.346369,0.485781,0.341840,0.302634,0.264349,0.0,0.0,3390525093
5892,0.0,0.126761,0.069930,0.434588,1.000000,0.004836,0.360947,0.316494,0.268749,0.238884,0.171234,0.0,0.0,1247467214992748544


In [289]:
# we will use nearest neighbours to find those accounts with similar features to our verified 
X_verified = X.loc[X['verified']==1]
X_not_verified = X.loc[X['verified']==0]
X_verified = X_verified.iloc[:,:-2]
X_not_verified = X_not_verified.iloc[:,:-2]

# brute force just means it will literally take the distance between all pairs which is fine for our sample size
nbrs = NearestNeighbors(n_neighbors=50, algorithm='brute').fit(X_not_verified)
distances, indices = nbrs.kneighbors(X_verified)

In [290]:
# see what accounts are suspect
test = np.concatenate(indices,axis=0)
link = X.iloc[test,-1]
print(len(set(link)),"suspected business/orgs!")

test_df = df[df['id'].isin(link)]

1184 suspected business/orgs!


In [291]:
# account info for visual inspection 
pivot = pd.DataFrame({'ids':link,'count':1})
grouper = pivot.groupby('ids').sum()
grouper.sort_values('count')
test_df = test_df.merge(grouper,left_on='id',right_on=grouper.index)
test_df.shape

(1184, 87)

In [292]:
# add variable to df
suspected = []
for i in df.id:
    if i in set(link):
        suspected.append(1)
    else:
        suspected.append(0)
df['suspected_business'] = suspected

So we have x amount of suspected accounts - we are now making the assumption that those accounts who **don't** have a bio URL are not a business/organisation or NGO. This is more based on the idea that why would corporate social media accounts not contain a link to a site or other social media? Even @Twitter has a link...

![alt text](twitter_profile.png "Title")

In [293]:
# so lets drop those without and store as suspected_org
df_dropped = df.loc[(df.url.notnull() == True) & (df.suspected_business == 1)]
df_dropped.shape

(492, 87)

In [294]:
suspected_org = set(df_dropped.username)

In [295]:
# filter out whats left (our humans!)
df_new = df.drop(df[(df['url'].notnull()==True) & (df['suspected_business'] == 1)].index)
df_new

Unnamed: 0,id,name,username,location,url,description,followers,friends,favourites_count,statuses_count,...,username_in_urltitle,username_name_similarity,username_in_bio,lower_userid,popularity,bio_sentiment_negative,bio_sentiment_neutral,bio_sentiment_positive,verified,suspected_business
0,914204469927260162,LJ,007_lj,,,,1,92,15,32,...,,0.571429,0,0,0.010870,0,1,0,0,0
1,2588852462,💧 zero emissions noosa,0Thornton,"Queensland, Australia",,concerned for the future of our planet,252,324,21588,25374,...,,0.193548,0,0,0.777778,1,0,0,0,0
2,257857728,Michael Vincent 🇪🇺,0Vinz,Paris / Brussels / London / EU,https://t.co/Q8qIus4iO9,auteur «le banquier et le citoyen» ➡️ https://...,2804,2468,14595,11453,...,0.0,0.260870,0,1,1.136143,0,1,0,1,0
3,114771707,brian glennie,0briang,,,,7,27,14,4326,...,,0.600000,0,1,0.259259,0,1,0,0,0
4,1232099440346812417,0retrocap,0retrocap,,,one two three,51,191,963,731,...,,1.000000,0,0,0.267016,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6827,1222260400277688321,Zoë M.,zoemmsw,"Fullerton, CA",,change agent. csuf msw. #advocate,71,100,36,68,...,,0.461538,0,0,0.710000,0,1,0,0,0
6829,137410730,Suzanne Dhaliwal,zoozanne,,https://t.co/EjlJLZ1Erw,"climate justice campaigner - producer, writer,...",2327,3769,7535,3037,...,0.0,0.416667,0,1,0.617405,0,0,1,0,0
6830,3390525093,Zuzana,zscczptglobal,,,plant-based 🌱 enviromental&animal activist 🌍🐄💪🏻,40,212,973,1114,...,,0.315789,0,0,0.188679,0,1,0,0,0
6832,1247467214992748544,ZUKI,zukibites,,,oppose the #lockdown,36,151,262,506,...,,0.615385,0,0,0.238411,0,1,0,0,0


In [296]:
# just to check everything has gone through
humans = set(df_new.username)

In [297]:
print(len(confirmed_org))
print(len(suspected_org))
print(len(humans))

854
492
5418


In [298]:
hits = list(confirmed_org) + list(suspected_org) + list(humans)
print(len(hits))
print(len(set(df_orig.username)))

6764
6764


In [299]:
orgs = pd.DataFrame({'username':list(confirmed_org),'tag':'org'})
orgs_sus = pd.DataFrame({'username':list(suspected_org),'tag':'org'})
humans_ = pd.DataFrame({'username':list(humans),'tag':'human'})
bots = pd.DataFrame({'username':list(bots),'tag':'bot'})

final_df = pd.concat([orgs,orgs_sus,humans_,bots],ignore_index=True)
final_df

Unnamed: 0,username,tag
0,Bergeonline,org
1,SCCAWatCom,org
2,ActivistOrr,org
3,ncAPPPL,org
4,BeyondSport,org
...,...,...
6822,sensiblebot,bot
6823,resilienceproj2,bot
6824,augurisk,bot
6825,edy_miyashiro,bot


In [300]:
final_df.groupby('tag').count()

Unnamed: 0_level_0,username
tag,Unnamed: 1_level_1
bot,63
human,5418
org,1346


In [301]:
output = df_final.merge(final_df,left_on='username',right_on='username')

In [302]:
#output.to_csv("climate_tagged.csv")

Visual inspection suggests (with outliers of course) that many of the businesses returned have urls in their bio and high similarity with verified users, this seems a common theme and is not picked up by the regression model - so with the assumption that all business/organisations have a url in their bio (as why else would they have a social media account) we can identify businesses etc as those who are similar in key features to verified users **and** have a url in their bio.

We might be able to add other filters - such as default profile image etc depending on success or current method

It also seems to miss alot of small community accounts - but these are orgs with no wider online presence outside of Twitter an typcially low follower counts (similar to a normal person)