In [1170]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [1171]:
df = pd.read_csv("Final datasets/model_data/users_climate_with_features.csv")
df2 = pd.read_csv("Final datasets/users_data/users_climate.csv")
df = df.iloc[:,1:]
verified = df2.iloc[:,[0,6]]
df = df.merge(verified,left_on='id',right_on='id')
df.verified = df.verified*1

In [1172]:
#df['url'] = df.url.notnull().astype('int')
# if we want to assume business/orgs/personalities dont have URLs remove 0's
#df = df[df['url'] == 1]
df.shape

(6834, 86)

### Base methodology to identify business/organisations and personalities within our dataset

This is an initial attempt at filtering out business/organisations and internet personalities from our data once bots have been removed. The aim is to use the characteristics of verified accounts in our dataset to identifiy those with similar features:
 - Run a logistic regression based on identifying verified users within our dataset (97% acc)
 - Using feature elimination to identify the significant features for verified users (currently at 12)
 - Identify clusters of accounts similar to those verified, currently using euclidean distance of each row as a 12x1 vector and set as 3 nearest for every verified account
 - We may be able to adopt the EM-algorithm for this job - the method seems to be picking out businesses but theres still some actual users 
 - The method depends on the amount of features selected, and the amount of nearest neighbours so we can tweak these 

In [1173]:
ratios = []

group = df.groupby('verified').mean()
for column in group:
    ratio = group[column][1]/group[column][0]
    ratios.append(ratio)
means = pd.DataFrame({'feature':group.columns.values,'dif':ratios})

In [1174]:
# what features differ the most between verified and non-verified users?
means = means.sort_values('dif',ascending=False)
means.head(10)

Unnamed: 0,feature,dif
75,followers_friends_ratio,90.950454
1,followers,89.541313
8,followers_age,38.075303
39,OGTratio,20.775923
38,favourite_countOT,13.79376
73,username_in_bio,4.031567
2,friends,2.17294
19,non_unique_tweets,1.974816
74,lower_userid,1.887393
71,username_in_urltitle,1.788263


In [1175]:
# username_in_urltitle has NaNs
top_features = list(means.iloc[:30,0])
top_features.extend(['verified','username'])
top_features

['followers_friends_ratio',
 'followers',
 'followers_age',
 'OGTratio',
 'favourite_countOT',
 'username_in_bio',
 'friends',
 'non_unique_tweets',
 'lower_userid',
 'username_in_urltitle',
 'statuses_count',
 'days_active',
 'average_hours_tweeted',
 'adj_OT',
 'nouns_OT',
 'nHW_OT',
 'w/urlR_OT',
 'StopW_OT',
 'verbs_OT',
 'bio_sentiment_positive',
 'TSW_OT',
 'alp_OT',
 'TCW_OT',
 'words_OT',
 'Tot_entities_OT',
 'ORGs_OT',
 'char_OT',
 'username_name_similarity',
 'tot_hashtags',
 'adv_OT',
 'verified',
 'username']

We can be pretty certain that those verified are not real people (personalities, organisations etc) - we can also see how some features appear to differ drastically for verified users, even the features that are based on followings are expected

In [1176]:
df.groupby('verified').mean()

Unnamed: 0_level_0,id,followers,friends,favourites_count,statuses_count,default_profile,default_profile_image,days_active,followers_age,following_age,...,hash_PT,username_urltitle_simimlarity,username_in_urltitle,username_name_similarity,username_in_bio,lower_userid,followers_friends_ratio,bio_sentiment_negative,bio_sentiment_neutral,bio_sentiment_positive
verified,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.558795e+17,2506.374755,1668.161709,14744.589159,19765.274951,0.545825,0.045599,2242.252756,1.469184,1.382253,...,0.763678,0.315249,0.190566,0.589808,0.016458,0.441945,5.764521,0.145553,0.465499,0.388948
1,7.819968e+16,224424.085308,3624.815166,12902.004739,34952.57346,0.175355,0.0,3496.630332,55.939618,1.121926,...,0.828167,0.366714,0.340782,0.722738,0.066351,0.834123,524.285772,0.113744,0.379147,0.507109


In [1177]:
log_df = df.select_dtypes(exclude=['object'])
log_df = log_df[log_df.columns.drop(list(log_df.filter(regex='followers_fr')))]
log_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6834 entries, 0 to 6833
Data columns (total 79 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              6834 non-null   int64  
 1   followers                       6834 non-null   int64  
 2   friends                         6834 non-null   int64  
 3   favourites_count                6834 non-null   int64  
 4   statuses_count                  6834 non-null   int64  
 5   default_profile                 6834 non-null   int64  
 6   default_profile_image           6834 non-null   int64  
 7   days_active                     6834 non-null   int64  
 8   followers_age                   6834 non-null   float64
 9   following_age                   6834 non-null   float64
 10  favourites_age                  6834 non-null   float64
 11  tweets_age                      6834 non-null   float64
 12  username_char_len               68

In [1178]:
log_df = log_df.drop(columns=['username_in_urltitle','username_urltitle_simimlarity'])
log_df = log_df.dropna()
log_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6827 entries, 0 to 6833
Data columns (total 77 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              6827 non-null   int64  
 1   followers                       6827 non-null   int64  
 2   friends                         6827 non-null   int64  
 3   favourites_count                6827 non-null   int64  
 4   statuses_count                  6827 non-null   int64  
 5   default_profile                 6827 non-null   int64  
 6   default_profile_image           6827 non-null   int64  
 7   days_active                     6827 non-null   int64  
 8   followers_age                   6827 non-null   float64
 9   following_age                   6827 non-null   float64
 10  favourites_age                  6827 non-null   float64
 11  tweets_age                      6827 non-null   float64
 12  username_char_len               68

In [1179]:
ids = log_df.id.copy()
sum(ids.notnull())

6827

### Normalise the data for logisitic regression (y = verified)

In [1180]:
from sklearn import preprocessing
log_df = log_df.iloc[:,1:]
x = log_df.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
scaled_X= pd.DataFrame(x_scaled,columns=log_df.columns)
scaled_X

Unnamed: 0,followers,friends,favourites_count,statuses_count,default_profile,default_profile_image,days_active,followers_age,following_age,favourites_age,...,Gun_Index,tot_hashtags,hash_PT,username_name_similarity,username_in_bio,lower_userid,bio_sentiment_negative,bio_sentiment_neutral,bio_sentiment_positive,verified
0,1.126858e-07,0.000400,0.000017,0.000015,1.0,1.0,0.185458,5.239113e-07,0.000835,2.861882e-05,...,0.205511,0.006276,0.039226,0.571429,0.0,0.0,0.0,1.0,0.0,0.0
1,2.839683e-05,0.001408,0.024337,0.013019,1.0,0.0,0.422908,5.824479e-05,0.001297,1.817070e-02,...,0.121889,0.009995,0.009995,0.193548,0.0,0.0,1.0,0.0,0.0,0.0
2,3.159711e-04,0.010726,0.016453,0.005876,0.0,0.0,0.665139,4.127720e-04,0.006293,7.824184e-03,...,0.129321,0.016272,0.016272,0.260870,0.0,1.0,0.0,1.0,0.0,1.0
3,7.888008e-07,0.000117,0.000016,0.002219,1.0,1.0,0.739841,9.266927e-07,0.000062,6.749451e-06,...,0.165230,0.002092,0.002092,0.600000,0.0,1.0,0.0,1.0,0.0,0.0
4,5.746977e-06,0.000830,0.001086,0.000374,1.0,0.0,0.010558,3.990957e-04,0.025888,2.744327e-02,...,0.142868,0.007903,0.007903,1.000000,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6822,2.622199e-04,0.016380,0.008494,0.001557,0.0,0.0,0.726096,3.138747e-04,0.008805,3.701229e-03,...,0.183012,0.024407,0.024407,0.416667,0.0,1.0,0.0,0.0,1.0,0.0
6823,4.507433e-06,0.000921,0.001097,0.000571,1.0,0.0,0.344622,1.133335e-05,0.001040,1.003954e-03,...,0.149550,0.028824,0.028824,0.315789,0.0,0.0,0.0,1.0,0.0,0.0
6824,2.366402e-06,0.000608,0.000001,0.003319,1.0,0.0,0.390438,5.255335e-06,0.000607,9.113472e-07,...,0.174604,0.013017,0.013017,0.235294,0.0,0.0,0.0,1.0,0.0,0.0
6825,4.056690e-06,0.000656,0.000295,0.000259,1.0,0.0,0.002191,8.451437e-04,0.061399,2.239918e-02,...,0.156527,0.023013,0.023013,0.615385,0.0,0.0,0.0,1.0,0.0,0.0


In [1181]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [1182]:
X = scaled_X.iloc[:,:-1]
y = scaled_X.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [1183]:
logreg= LogisticRegression(max_iter=1000)
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [1184]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
coefs = []

for i in list(range(0,len(X.columns)-1)):
    print(X.columns[i],"-",logreg.coef_[0,i])
    coefs.append(logreg.coef_[0,i])
    
print("") 
y_pred = cross_val_predict(logreg,X,y,cv=10)
print(logreg.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y, y_pred))    

followers - 1.2115675832495028
friends - 0.32650609468936587
favourites_count - -0.30251437158983635
statuses_count - 0.13651433891836953
default_profile - -0.6444087572060325
default_profile_image - -0.8718014160170041
days_active - 2.536407940028958
followers_age - 1.5136625753750748
following_age - 0.13974863526622353
favourites_age - -0.16257506237968394
tweets_age - -0.012120443934380325
username_char_len - -0.14727823308901467
name_ratio - 1.4303717653273131
username_int - -0.7559577184565757
username_char - 0.41964140766596203
username_other - 0.29452223998207605
username_int_end - -0.998501664821338
name_int - 0.2519605636262277
non_unique_tweets - 0.027827138678166087
average_tweets_day - -0.9728034582834765
average_minutes_between_tweets - -0.14252057285859487
average_hours_tweeted - 3.0206318681470563
swears - -1.1612367460708122
polite - -0.22006061493487505
fourchan - -0.5846173488493205
log_swears - 0.7003683360280453
log_polite - -0.03691793190371291
log_fourchan - -0.17

In [1185]:
results = pd.DataFrame({'feature':X.columns[:-1],'coef':coefs})
results.sort_values('coef',ascending=False)

Unnamed: 0,feature,coef
21,average_hours_tweeted,3.020632
38,OGTratio,2.567659
6,days_active,2.536408
37,favourite_countOT,1.527102
7,followers_age,1.513663
...,...,...
40,ment_OT,-0.801908
5,default_profile_image,-0.871801
19,average_tweets_day,-0.972803
16,username_int_end,-0.998502


### Recursive feature elimination

In [1186]:
from sklearn.feature_selection import RFE

estimator = logreg
selector = RFE(estimator, 12, step=1)
selector = selector.fit(X, y)
rfe_list = list(selector.support_)

In [1187]:
X = X.loc[:,rfe_list]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [1188]:
logreg= LogisticRegression()
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [1189]:
coefs = []

for i in list(range(0,len(X.columns))):
    print(X.columns[i],"-",logreg.coef_[0,i])
    coefs.append(logreg.coef_[0,i])
    
print("") 
y_pred = cross_val_predict(logreg,X,y,cv=10)
print(logreg.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y, y_pred))    

days_active - 3.4426995502711986
followers_age - 1.915430673909039
username_int_end - -1.616636600563967
average_tweets_day - -1.4627215449932969
average_hours_tweeted - 3.139871319904155
swears - -1.4149926692749328
favourite_countOT - 1.5581852112304222
OGTratio - 2.6865723544369393
w/urlR_OT - 1.789586594049332
StopW_OT - 1.6642132650259531
nouns_OT - 1.9781719816971737
adj_OT - 1.1826982421675183

LogisticRegression accuracy is 0.971


### Cluster the data by distance to verified users as represented as a vector (12x1) for each account

In [1190]:
X['verified'] = y
X['ids'] = list(ids)

In [1191]:
from sklearn.neighbors import NearestNeighbors


X_verified = X.loc[X['verified']==1]
X_not_verified = X.loc[X['verified']==0]
X_verified = X_verified.iloc[:,:-2]
X_not_verified = X_not_verified.iloc[:,:-2]

nbrs = NearestNeighbors(n_neighbors=3, algorithm='brute').fit(X_not_verified)
distances, indices = nbrs.kneighbors(X_verified)

In [1192]:
result = pd.DataFrame({'indices':indices[:,0],'distances':distances[:,0],
                       'indices2':indices[:,1],'distances2':distances[:,1],
                       'indices3':indices[:,2],'distances3':distances[:,2],})
result = result.sort_values('distances')
result.head()

Unnamed: 0,indices,distances,indices2,distances2,indices3,distances3
18,6503,0.038494,514,0.077022,727,0.084319
152,3726,0.048038,5323,0.084453,3797,0.084678
25,4819,0.048861,3235,0.080346,3819,0.094346
117,3513,0.050478,6276,0.0864,4435,0.095844
114,6296,0.051244,4823,0.070348,4800,0.080457


In [1193]:
ind = 0
ranges = list(result.iloc[ind,[0,2,4]]) 
users = []

for i in ranges:

    link = X.iloc[int(i),-1]
    users.append(link)

info = df[df['id'].isin(users)]
info

Unnamed: 0,id,name,username,location,url,description,followers,friends,favourites_count,statuses_count,...,username_urltitle_simimlarity,username_in_urltitle,username_name_similarity,username_in_bio,lower_userid,followers_friends_ratio,bio_sentiment_negative,bio_sentiment_neutral,bio_sentiment_positive,verified
516,1110829371164684288,CAST,CAST_Centre,Cardiff and partners,https://t.co/nnXfva9V0X,We are the Centre for Climate Change and Socia...,1956,703,258,520,...,,,0.571429,0,0,2.782361,0,1,0,0
729,1184386025818337280,XR UK Citizens’ Assembly Working Group,CitizensXr,,https://t.co/Q6DdQo1WhE,Extinction Rebellion demands the UK Gov create...,788,187,192,434,...,0.363636,0.0,0.375,0,0,4.213904,1,0,0,0
6510,1248136856992206850,thelemur,thelemur6,,,"Mammal of the order Primates, divided into 8 f...",2,70,35,67,...,,,0.941176,0,0,0.028571,0,1,0,0


Visual inspection suggests (with outliers of course) that many of the businesses returned have urls in their bio, this seems a common theme and is not picked up by the regression model - so with the assumption that all business/organisations have a url in their bio (as why else would they have a social media account) we can identify businesses etc as those who are similar in key features to verified users **and** have a url in their bio.

We might be able to add other filters - such as default profile image etc depending on success or current method