In [1]:
import pandas as pd
pd.set_option('use_inf_as_na', True)
import numpy as np
import re, math
from collections import Counter
from nltk.corpus import stopwords # pip install nltk (nlp toolkit)
from nltk.stem.porter import *
#nltk.download('wordnet') - https://www.nltk.org/howto/wordnet.html
from nltk.corpus import wordnet as wn
stop = stopwords.words('english')

In [2]:
# functions for get_similarity etc - included but can improve these
# Tokenizer: matches runs of word characters (letters, digits, underscore).
WORD = re.compile(r'\w+')
# Shared Porter stemmer, used by text_to_vector to reduce terms to their stems.
stemmer = PorterStemmer()

def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight vectors.

    Parameters
    ----------
    vec1, vec2 : mapping
        Term -> numeric weight (for example ``collections.Counter`` objects).

    Returns
    -------
    float
        Dot product over the terms both vectors share, divided by the product
        of their Euclidean norms; ``0.0`` when that product is zero.
    """
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = 0
    for term in shared_terms:
        dot_product += vec1[term] * vec2[term]

    squares1 = sum(weight ** 2 for weight in vec1.values())
    squares2 = sum(weight ** 2 for weight in vec2.values())
    norm_product = math.sqrt(squares1) * math.sqrt(squares2)

    # Guard clause: an empty or all-zero vector has no direction to compare.
    if not norm_product:
        return 0.0
    return float(dot_product) / norm_product

def text_to_vector(text):
    """Convert text into a bag of stemmed terms, expanded with WordNet lemmas.

    The text is tokenised with ``WORD``; every token contributes itself plus
    the lemma names of all of its WordNet synsets. The distinct candidates
    that are not in the stop-word list are then stemmed and counted.

    Parameters
    ----------
    text : str
        Text to vectorise (callers in this notebook pass it lower-cased).

    Returns
    -------
    collections.Counter
        Stem -> number of distinct candidate terms that reduce to that stem.
    """
    words = WORD.findall(text)

    # Collect candidates straight into a set. The original checked membership
    # against a growing list (quadratic) only to convert it to a set afterwards.
    candidates = set(words)
    for word in words:
        for synset in wn.synsets(word):
            candidates.update(synset.lemma_names())

    # Constant-time stop-word lookups instead of scanning the list per term.
    stop_words = set(stop)
    stems = [stemmer.stem(term) for term in candidates if term not in stop_words]
    return Counter(stems)

def get_similarity(a, b):
    """Cosine similarity between the term vectors of two pieces of text.

    Both inputs are stripped of surrounding whitespace and lower-cased before
    being vectorised with ``text_to_vector``.

    Parameters
    ----------
    a, b : str
        The texts to compare.

    Returns
    -------
    float
        The value returned by ``get_cosine`` for the two vectors.
    """
    first_vector, second_vector = [
        text_to_vector(text.strip().lower()) for text in (a, b)
    ]
    return get_cosine(first_vector, second_vector)

def get_char_wise_similarity(a, b):
    """Average pairwise similarity between the terms of two texts.

    Despite the name, the comparison is made per term, not per character:
    both texts are vectorised with ``text_to_vector`` and every term of the
    first is scored against every term of the second with ``get_similarity``.

    Parameters
    ----------
    a, b : str
        The texts to compare.

    Returns
    -------
    float
        Mean of all pairwise scores, or ``0.0`` when either text yields no
        terms (so there are no pairs to average).
    """
    terms_a = text_to_vector(a.strip().lower())
    terms_b = text_to_vector(b.strip().lower())

    scores = [
        get_similarity(str(term_a), str(term_b))
        for term_a in terms_a
        for term_b in terms_b
    ]

    # Explicit empty check instead of a bare ``except`` around the division,
    # which would also have hidden unrelated errors.
    if not scores:
        return 0.0
    return sum(scores) / float(len(scores))

**Develop bot detection features**

In [7]:
# load in the 5000 random samples
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks
# like a row index that was written into the CSV - confirm whether it should
# be read with index_col=0 or dropped before modelling.
accounts_df = pd.read_csv("5000_accounts_climate.csv")
accounts_df.head()

Unnamed: 0,id,name,username,location,url,description,verified,followers,friends,favourites_count,statuses_count,created_at,default_profile,default_profile_image
0,1098803589609189376,💧The Cranky Croation,JohnSarich2,,,My First ever vote was for Gough Whitlam. Left...,False,430,291,14866,6039,2019-02-22 04:36:05,True,False
1,191393940,FranS #RejoinEU #ElectoralReform 🇬🇧🇪🇺🇩🇪,FranS199,Earth,,"Love good food, good wine, great company. Hate...",False,4643,4904,257039,41696,2010-09-16 10:30:21,True,False
2,1185167241819676673,mbnvcxz,mbnvcxz2,,,,False,0,3,1,8,2019-10-18 12:14:42,True,True
3,1220868357647368193,Noles-4-Life-In-SC,Noles4LifeInSC,,,Florida native living in Upstate South Carolin...,False,628,630,1852,1862,2020-01-25 00:39:20,True,False
4,347142932,Life Cycle UK,LifeCycleUKteam,Bristol,http://t.co/oCrHfe7xAg,Life Cycle UK is a Bristol-based charity that ...,False,3105,1645,3039,5349,2011-08-02 10:21:33,False,False


In [4]:
# Multiply the two default-profile flag columns by 1 so the True/False values
# become 1/0 numbers.
for flag_column in ('default_profile', 'default_profile_image'):
    accounts_df[flag_column] = accounts_df[flag_column] * 1

In [5]:
# days since account has been open
# Reference date that account ages are measured against.
SNAPSHOT_DATE = pd.Timestamp('2020-03-12')

# Guard on the raw column so the cell can be re-run safely: once `created_at`
# has been consumed and dropped there is nothing left to do.
if 'created_at' in accounts_df:
    # Keep only the calendar date in front of the first space. `n=1` is passed
    # by keyword and `.str[0]` selects the first piece; newer pandas rejects
    # both the positional `n` and tuple-unpacking of the `.str` accessor.
    date_part = accounts_df['created_at'].str.split(' ', n=1).str[0]
    accounts_df['date_created'] = pd.to_datetime(date_part, format='%Y-%m-%d')
    accounts_df['days_active'] = (SNAPSHOT_DATE - accounts_df['date_created']).dt.days
    accounts_df = accounts_df.drop(columns=['created_at'])

In [6]:
# Social figures relative to account age: each raw counter divided by the
# number of days the account has been open.
per_day_features = {
    'followers_age': 'followers',
    'following_age': 'friends',
    'favourites_age': 'favourites_count',
    'tweets_age': 'statuses_count',
}
for feature_name, counter_column in per_day_features.items():
    accounts_df[feature_name] = accounts_df[counter_column] / accounts_df['days_active']

# Accounts followed ("friends") per follower.
accounts_df['followers_ratio'] = accounts_df['friends'].div(accounts_df['followers'])

accounts_df.head(3)

Unnamed: 0,id,name,username,location,url,description,verified,followers,friends,favourites_count,statuses_count,default_profile,default_profile_image,date_created,days_active,followers_age,following_age,favourites_age,tweets_age,followers_ratio
0,1098803589609189376,💧The Cranky Croation,JohnSarich2,,,My First ever vote was for Gough Whitlam. Left...,False,430,291,14866,6039,1,0,2019-02-22,384,1.119792,0.757812,38.713542,15.726562,0.676744
1,191393940,FranS #RejoinEU #ElectoralReform 🇬🇧🇪🇺🇩🇪,FranS199,Earth,,"Love good food, good wine, great company. Hate...",False,4643,4904,257039,41696,1,0,2010-09-16,3465,1.339971,1.415296,74.18153,12.033478,1.056214
2,1185167241819676673,mbnvcxz,mbnvcxz2,,,,False,0,3,1,8,1,1,2019-10-18,146,0.0,0.020548,0.006849,0.054795,


In [7]:
# Username features. Usernames can only use A-Z, 0-9 and _ and are not
# case-sensitive.
usernames = accounts_df['username']
accounts_df['username_char_len'] = usernames.str.len()

# Length of the display name relative to the length of the username.
accounts_df['name_ratio'] = accounts_df['name'].str.len() / usernames.str.len()

# Number of digits in the username.
username_int = [sum(ch.isdigit() for ch in handle) for handle in usernames]
accounts_df['username_int'] = username_int

# Number of letters in the username.
username_char = [sum(ch.isalpha() for ch in handle) for handle in usernames]
accounts_df['username_char'] = username_char

# Whatever is neither a digit nor a letter (underscores).
accounts_df['username_other'] = accounts_df['username_char_len'] - (
    accounts_df['username_int'] + accounts_df['username_char']
)

# Length of the run of digits at the end of the username (0 when there is none).
trailing_digits = (re.search(r'\d+$', handle) for handle in usernames)
results = [0 if found is None else len(found.group()) for found in trailing_digits]
accounts_df['username_int_end'] = results

# Number of digits in the display name (not common for real users, but maybe
# for businesses!).
name_int = [
    sum(ch.isdigit() for ch in display_name)
    for display_name in accounts_df['name']
]
accounts_df['name_int'] = name_int

# Open question: are there cases of bots using techniques such as emoji flags
# or hashtags to make their account details appear real?
accounts_df.head()

Unnamed: 0,id,name,username,location,url,description,verified,followers,friends,favourites_count,...,favourites_age,tweets_age,followers_ratio,username_char_len,name_ratio,username_int,username_char,username_other,username_int_end,name_int
0,1098803589609189376,💧The Cranky Croation,JohnSarich2,,,My First ever vote was for Gough Whitlam. Left...,False,430,291,14866,...,38.713542,15.726562,0.676744,11,1.818182,1,10,0,1,0
1,191393940,FranS #RejoinEU #ElectoralReform 🇬🇧🇪🇺🇩🇪,FranS199,Earth,,"Love good food, good wine, great company. Hate...",False,4643,4904,257039,...,74.18153,12.033478,1.056214,8,4.875,3,5,0,3,0
2,1185167241819676673,mbnvcxz,mbnvcxz2,,,,False,0,3,1,...,0.006849,0.054795,,8,0.875,1,7,0,1,0
3,1220868357647368193,Noles-4-Life-In-SC,Noles4LifeInSC,,,Florida native living in Upstate South Carolin...,False,628,630,1852,...,39.404255,39.617021,1.003185,14,1.285714,1,13,0,0,1
4,347142932,Life Cycle UK,LifeCycleUKteam,Bristol,http://t.co/oCrHfe7xAg,Life Cycle UK is a Bristol-based charity that ...,False,3105,1645,3039,...,0.966296,1.700795,0.529791,15,0.866667,0,15,0,0,0


In [8]:
get_similarity('I am a good boy', 'I am a very disciplined guy')
# Returned 0.5417490779798924 in the recorded run below; an earlier note here
# said 0.5491201525567068. NOTE(review): the score presumably shifts with the
# installed WordNet / stop-word data - confirm before relying on exact values.

0.5417490779798924

In [9]:
# need to find a way of improving this method ...
def remove_emojis(string):
    """Reduce a string to plain ASCII, dropping emojis and other symbols.

    Accented letters are kept as their base letter ("é" becomes "e") rather
    than being deleted, so names such as "Andrés" stay comparable with an
    ASCII username. Characters with no ASCII equivalent after decomposition
    (emojis, flags, non-Latin scripts) are removed.

    Parameters
    ----------
    string : str
        Text to clean, e.g. a display name.

    Returns
    -------
    str
        ASCII-only version of ``string``.
    """
    import unicodedata  # local import keeps this cell self-contained

    # NFKD splits an accented letter into its base letter plus combining
    # marks; the ASCII round trip below then discards the marks and every
    # other non-ASCII character.
    decomposed = unicodedata.normalize('NFKD', string)
    return decomposed.encode('ascii', 'ignore').decode('ascii')

# Similarity between each account's username and its cleaned display name.
username_name_sim = []

for handle, display_name in zip(accounts_df['username'], accounts_df['name']):
    ascii_name = remove_emojis(display_name)
    # Leave hashtags out of the display name before comparing.
    kept_words = [word for word in ascii_name.split() if not word.startswith('#')]
    cleaned_name = " ".join(kept_words)
    score = get_similarity(handle, cleaned_name)
    print(handle, "-", cleaned_name, "-", score)
    username_name_sim.append(score)

JohnSarich2 - The Cranky Croation - 0.0
FranS199 - FranS - 0.0
mbnvcxz2 - mbnvcxz - 0.0
Noles4LifeInSC - Noles-4-Life-In-SC - 0.0
LifeCycleUKteam - Life Cycle UK - 0.0
femmekatz - Kathe Garbrick - 0.0
itsmarthabrady - Martha Brady - 0.0
kjellrot - Kjell Granrot - 0.0
glandwrwj - Wayne J - 0.0
jkahonen - Kristian Ahonen - 0.0
KleanIndustries - Klean Industries - 0.0
Villain1982 - Villain - 0.0
mgriff3456 - Thomas Griffin - 0.0
SustransNI - Sustrans NI - 0.0
RemainRevoke - Rejoin & rebuild? - 0.0
JillFuglister - Jill Fuglister - 0.0
SeniyaAK - SENIYA A - 0.0
pel_reece - Paul Elliot Reece - 0.0
simpson_d01 - David Simpson - 0.0
jo_head15 - Jo Head - 0.0
chris_griffith - Chris Griffith - 0.0
ertimus - Ertimus - 1.0
PaulMcM78 - PaulMac GCHQ - 0.0
THE_OTHER_BAJO - Blah blah blah - 0.0
CThornthwaite - Cris Harrison - 0.0
CGSBucks - Annmarie McNaney - 0.0
dsgold - David Goldsmith - 0.0
AshleyGunstock - Ashley Gunstock - 0.0
RIBABristolBath - RIBA Bristol & Bath - 0.0
shelleypetersen - Michelle

bmpermie - Pat R - 0.0
rachelathecoast - rachel howard - 0.0
grevillemills - Greville Mills - 0.0
Hotchillienema - Richard Anderson - 0.0
MonHublot - Victoria Emslie - 0.0
KimDriver11 - Kim Driver - 0.0
contentilo - Contentilo - 1.0
alexforeurope - Councillor Alexandra Phillips - 0.0
soorips - soorips- Starving koalas still being rescued on KI - 0.10101525445522107
alisonclareteal - Alison Teal - 0.0
philhar75151850 - phil harrison - 0.0
MShiltonGodwin - MandieShiltonGodwin - 0.0
garyfoskett - Clapton Blues - 0.0
anoellemartin - Abigail Noelle Martin - 0.0
BWCE - B&WCommunityEnergy - 0.0
gay4film - The Mandalesbian - 0.0
theblackeffect7 - stephen black - 0.0
JamesDe34061167 - Doc Jamie - 0.0
tonyhandley18 - tonyhandley - 0.0
EmmaHowardBoyd - Emma Howard Boyd - 0.0
1_bstevens - Ben Stevens - 0.0
ANonsens - Anthony J - 0.0
transitionbux - Transition Buxton - 0.0
NorthernGather2 - NorthernGatherer - 0.0
JaneBea5 - Jane Bee - 0.0
GoGreenae - Go Green from Cyber Gear - 0.0
1Paul_Burrows - D

WoodsHoleResCtr - Woods Hole Research Center - 0.0
sunpath2 - Black Joan - 0.0
CrystalSeahorse - S - 0.0
ianrkenilworth - Ian Rose - 0.0
JackEmanuel14 - Jack Emanuel - 0.0
ChesterUniBRI - BRI - 0.0
RadiHood - Radi Hood - 0.0
GemKPea - Dr. Gemma Pearson - 0.0
FFFChicago - Fridays For Future Chicago - 0.0
CarbonConvoCA - Carbon Conversations - 0.0
AmyDavidsen - Amy Davidsen - 0.0
GillSilvia - Silvia Gill - 0.0
chop_carry - ChopWoodCarryWater - 0.0
Ben87770052 - Ben - 0.0
Lauratobin1 - Laura Tobin - 0.0
TysonAdams1 - Tyson Adams - 0.0
Animal_Watch - Animal Watch - 0.0
toorad4u__ - toorad4u - 0.0
bkshittu - Buki Shittu-Muideen - 0.0
jpomallez - Sen O'Malle - 0.0
52DNorth - Joe Hurdman - 0.0
circleofrageuk - Circle of Rage - 0.0
CoastProtectors - Coast Protectors - 0.0
CeliaEGreen - Celia Green - 0.0
sunraysiadaily - Sunraysia Daily - 0.0
pkeillerauthor - Patricia Keiller - 0.0
Exoticstonewood - Exotic Stonewood Flooring - 0.0
MattThomson42 - Matt Thomson - 0.0
Zealandian - Lets Roll - 0.0


cllrjonhubbard - Jon Hubbard - 0.0
HarshadTambe - Harshad Tambe - 0.0
opalnova - opalnova - 1.0
ashbull1 - Ash - 0.0
SMeinrath - Steven Meinrath - 0.0
kate_edmonds_EU - Kate Edmonds In Mourning - 0.0
B778 - Bernard - 0.0
EUflagmafia - EU Flag Mafia - 0.0
Jessiiee2003 - Jessi Kelly - 0.0
cat_daisy - daisy cat - 0.0
stephaniebynum - Credulous Boomer Rube Demographic - 0.0
LucyOB83 - Lucy O'Brien - 0.0
Datahopa - Renewable Future - 0.0
Dribelo1 - Olof - 0.0
DavidTi25827615 - David Tibbs - 0.0
Bec_Hubbard - Bec Hubbard - 0.0
rozgab - Gabriel Rozenberg - 0.0
CityofLdnOnt - City of London - 0.0
Elainecoates101 - Elaine Coates - 0.0
nikki_dones - Nikki Dones - 0.0
vansgirl12 - Margot Paez - 0.0
KirkleesCouncil - Kirklees Council - 0.0
rndm_tht - Random Thoughts - 0.0
Janannemorris - Jan Morris - 0.0
EsTresidder - Es Tresidder - 0.0
J5vShout - John Valentine - 0.0
abline11 - Andy Brown - 0.0
timpwalker - Tim Walker - 0.0
riotintosuknews - Robert McLaughlin - 0.0
weldangel2 - Rosemary Penwarden

LILYROSE1402 - Lily Rose - 0.0
lucaberta - Luca Bertagnolio - 0.0
enoch_macdhu - GJC - 0.0
ThomasP30944087 - Thomas Paine - 0.0
ejwensing - EJ - 0.0
ruizjunixr - junior ruiz - 0.0
XRWarwickDist - Extinction Rebellion Warwick District - 0.0
sydcharles - Sydney Charles - 0.0
Mia_1 - MB So much love & respect for Yellow Finch! - 0.0
milesobrien - Miles O'Brien - 0.0
avivainvestors - Aviva Investors - 0.0
anthmusic - anthmusic - 0.0
sblogga - Essie Bee - 0.0
keanmwong - kean - 0.0
AdrianMourby - Adrian Mourby - 0.0
CamGlobEdu - CamGlobEdu - 1.0
FrankWi74044551 - William S. Frank - 0.0
SciPerspective - Science and Perspective - 0.0
thegreenbeeeco - The Green Bee - 0.0
BarMarsh - Barbara Marsh - 0.0
Carlota87593465 - Carlota - 0.0
adilsonestaaqui - mikami, adilson - 0.0
AmbientDM - Brett Leuenberger - 0.0
Tereneh152XX - Grandma Fagan played baseball - 0.0
JohnOSullivan36 - John OSullivan - 0.0
theresalrosa - Theresa Rosa - 0.0
Sea1kay - Caroline Kay - 0.0
BlaiseGammie - Blaise Gammie - 0.

pabathukpa - Chhe Wa Ng - 0.0
Millar_Colin - Colin Millar - 0.0
rucafiorio - Rui Sousa - 0.0
Rahyithumeehaa - Maan Rashyd - 0.0
theSNP - The SNP - 0.0
SteveVicMorrow - Sir Thunderbird - 0.0
BrtFlame - BrightFlame - 0.0
petrovdempski - Petrov Dempski - 0.0
SanghiSwati - Swati Sanghi - 0.0
back_cis - KC from marketing - 0.0
NapaWatershed - ProtectNapaWatersheds - 0.0
DrCEriksen - Dr Christine Eriksen - 0.0
EvertHassink - Evert Hassink - 0.0
Dana71429535 - Dana - 0.0
TrevorHaynes575 - Trevor Haynes - 0.0
fathers_jane - Jane Fathers Davidson - 0.0
ChrisPa13807454 - Chris Parkin - 0.0
Rusthallred - Rusthall Socialist - 0.0
HUMANBEINGONE - ME - 0.0
Trace_plus2 - Tracy Monahan - 0.0
PaulPauloPablo - Paul - 0.0
ODDO_BHF - ODDO BHF - 0.0
Mrjoewade - Joe Wade - 0.0
cindyol15279630 - cindy oliver - 0.0
psychicsister - Annie - 0.0
SuburbanRainbow - Tanya Juliette Rebecca Lippmann - 0.0
hughdavidson2 - hugh davidson - 0.0
AnilGhelani - Anil Ghelani, CFA - 0.0
ajsmith_libdem - Cllr Anthony Smith - 0

CllrJude - Cllr Jude Wells - 0.0
Ross_coP - Ross_co - 0.0
h_mad_murdock - HM Murdock - 0.0
suez - SUEZ - 1.0
kakudiego - Diego Kaku - 0.0
Meplus3_40 - Meg Lampard - 0.0
andrese_sierra - Andrs Sierra - 0.0
OrenShuali - Oren Tsur - 0.0
AmandaGFrame - Aframe - 0.0
gullfire_ - Gullfire Over Leningrad - 0.0
OlgaSperansk - Olga Speranskaya - 0.0
davva23 - Davva23 - 1.0
JimBOBlbc - BASED JIMBOB - 0.0
BourryYang - Barry Young - 0.0
Danfoss - Danfoss Group - 0.3333333333333333
dunEdun22 - Eoin Dunne - 0.0
nycbotanist - MarielleAnzelone - 0.0
secularcitizen2 - Secular Citizen for - 0.0
sandergeelen - Sander Geelen - 0.0
David_T_2013 - David R Tomlinson - 0.0
SayftyCom - Sayfty.com - 0.0
lschiefner - Land of Lee - 0.0
yspearl - Angie Bower - 0.0
BDSdragonflies - British Dragonfly Society - 0.0
Holly_RR - Holly Robinson - 0.0
evefrancisholt - Eve Holt - 0.0
emilysmithLD - Emily Smith - 0.0
perthwolves1 - David - 0.0
SoCalValleyGal - Diane - 0.0
nadir_so - Nadir S - 0.0
dorchesterclaud - claudia so

MariaForestRang - Maria Forest Range - 0.0
JoolsMcCarthy - Jools - 0.0
frostyboy74 - Johnno - 0.0
ChrisJC12002 - Chris Cooper - 0.0
_HanniePhillips - Hannie Phillips - 0.0
Lyndamathews25 - Lynda mathews - 0.0
emuwren - Lyn Martinez - 0.0
James_Allen_TLA - James Allen - 0.0
MargoCHanson - Margo 'MECA' Hanson (Make Earth Cool Again) - 0.0
bribrisimps - brianna - 0.0
dr_hurford - Dr Grace Hurford - 0.0
purplewoo9 - Wendy Waghorn - 0.0
Global_GWI - Wellness Institute - 0.0
milleramy - Amy Miller - 0.0
jhall_m25 - Beachcomber - 0.0
macadamia_man - macadamia man - 0.0
mark_osmer - Mark Osmer - 0.0
JLB99902 - Jennifer Button - 0.0
rcgp - RCGP - 1.0
NewAngliaEnergy - Nigel Cornwall - 0.0
John09386301 - John - 0.0
CanMountains - WillSki - 0.0
CAROLINE2812 - Caroline Davis - 0.0
AnastasiosManol - Anastasios (Taso) Manolakis ( - 0.0
Aerin_J - Aerin Jacob - 0.0
CsmcConsumables - Pakin Limkangwanmongkol - 0.0
NRutschilling - Nolan Rutschilling - 0.0
FabianDattner - Fabian Dattner - 0.0
davidbewart 

DavidSullivanMN - David Sullivan - 0.0
kittyW92686167 - kittyW - 0.0
huns62 - mark galloway - 0.0
KTVL - News 10 - 0.0
ElisendaVillena - elisenda villena - 0.0
DyspepticCodger - Dyspeptic Codger - - 0.0
RosieAfterglow - juliemcintyre - 0.0
copyrose - Rose_DC - 0.0
eden_renewables - Eden Renewables - 0.0
markofbattersea - Mark - 0.0
MattHarlowe - Matt Harlowe - 0.0
KhunJoe5 - Joe Wong - 0.0
ConvenorRTPIS - RTPI Scot Convenor - 0.0
tamsau - tamsau - 1.0
BerndBurkert - TheBernd - 0.0
ugtheweirdo - Gerald Clarke - 0.0
KenDoggrell - Ken Doggrell - 0.0
CharlesCharest - Charles Charest - 0.0
_James_T - James Turner - 0.0
mantasledge - Sandal Woody - 0.0
fighting4human1 - fighting4humanrights - 0.0
gregfthompson - Greg Thompson - 0.0
Athol94519002 - Athol - 0.0
AndrewCampey - Andrew Campey - 0.0
cooltolerance - CoolTolerance - 1.0
Oxfam - Oxfam International - 0.3779644730092272
FASTHANDSED - EDWARD WHITE - 0.0
crepeseason - Henry Roberts - 0.0
RainyDaysAZ - RainyDaysAZ - 1.0
GeminiCloud9 - Al

chfrank_cgn - Christian Frank - 0.0
Eldever1 - BritishPatriot - 0.0
obsolete29 -  - 0.0
MithuValika - Mithu Valika - 0.0
SteveAMBurgess - Cllr Steve Burgess - 0.0
UBCCALP - UBC CALP - 0.0
JoeGodden2 - Joe Godden - 0.0
XR_Newbury - Extinction Rebellion Newbury - 0.0
TwitrSpace - SUSANr - 0.0
Wojtek_Widuch - Wojtek Widuch - 0.0
ShopGreenRetail - Shop Green - 0.0
ianmcdo03120397 - ian mcdonald - 0.0
sdachygus - Stphane - 0.0
ZackPolanski - Zack Polanski - 0.0
stefanscrivener - DJ Steffy - 0.0
bluja4 - bluja - 0.0
panlidsid - Floydarama - 0.0
Newswriter22 - Sean Meyer - 0.0
tarjuccia - Tarja van Veldhoven - 0.0
busyworker99 - Vegan - 0.0
Tombag_AU - TOMbag Reusable Garbage Bags - 0.0
beccarala - Tracking the Australian Right Wing ... - 0.0
CraftingPhool - Melody Dryer - 0.0
BrodiesLLP - Brodies LLP - 0.0
PollyMilner - Polly Milner - 0.0
bridger_steve - Steve Bridger - 0.0
mindful_PR_ - Mindful Public Relations - 0.0
Kimmi1965 - VivaLaResistance Resisting Fascism - 0.0
bordersthinking - Eri

water_our - Our Water Challenge - 0.0
CCitizenship - Corporate Citizenship - 0.0
StarrOutlook - (((StarrFaithful))) (Sarah) - 0.0
SasjaBeslik - Sasja Beslik - 0.0
EcocideOblivion - Enjoy the Time Left - 0.0
Gibbo_app - Gibbo - 0.0
BrettWHarper -  - 0.0
sharma33deep - kuch_shabd - 0.0
simonphippss - Simon Phipps - 0.0
DrKassandraPari -  - 0.0
Chuddehinutwash - Not Spenny - 0.0
GJWorthing - GlobalJusticeWorthin - 0.0
rowenahutson - Rowena - 0.0
WVProtect - WV Public Protection - 0.0
FrankWinfield4 - Frank Winfield - 0.0
cllrgemmadavies - gemma davies - 0.0
WillisaH - Willisa Hogarth - 0.0
BeyondCoalMI - Beyond Coal Michigan - 0.0
tundra_m - Tundra M - 0.0
lornagreens - Lorna Slater - 0.0
NEON_UK - NEON - 0.0
DrJoshuaRoose - Joshua Roose - 0.0
esdawt - S dot - 0.0
ShivajiShivaLaw - Shivaji Shiva - 0.0
undergpossum - (((possum))) - 0.0
SNP_Failures - SNP The CULT in Scotland - 0.0
ChloeLeffakis - Chloe Leffakis - 0.0
TrishaC76417148 - TrishaC - 0.0
FundIndieABC - OurABC - 0.0
fJanssen7 - F

HempSvEarth - HempSaveEarth - 0.0
hellomadeinpink - Made in Pink - 0.0
Jophine47312187 - Jophine - 0.0
Right_to_Vote16 - Youth Suffrage - 0.0
helloEV3 - helloEV - 0.0
Groovenuts - Groovenuts - 1.0
bridgesforindy - BFI - 0.0
Inclusion4UsAll - AbleWarrior - David they/them - 0.0
Design_andMedia - Samantha Hallam - 0.0
darrwynne - Darragh Wynne - 0.0
Tropical_IBEC - island biodiversity - 0.0
EYPPC_GA - Geography Primary / Early Years Committee @The_GA - 0.0
experience_city - ExperienceCity - 0.0
janemick - Jane Mickelborough - 0.0
WeiZhangAtmos - Wei Zhang - 0.0
straighttalk75 - richy - 0.0
WobRotson - Rob Watson - 0.0
judithwilcock - judith wilcock - 0.0
mollaroom - molly fuller - 0.0
TheWhogg - Andy Biden - 0.0
dajashby - Derrick Ashby - 0.0
aicloudsolution - Guru Prasad - 0.0
jsbaxter_ - John Baxter - 0.0
dragonf51085478 - dragonfly - 0.0
AbysmalChump - Charley Weatherill - 0.0
abasedmonihubbu - wellfuc - 0.0
natespamblue - mobage gang - 0.0
spmallee - Sean Mallee - 0.0
the_magrathean 

NXIndustrialRev - Next Industrial Revolution - 0.0
Carocazz - Caro - 0.0
MDeMocker - Mary DeMocker - 0.0
richearle1 - Rich Earle - 0.0
Gillian88788002 - Gillian Warren - 0.0
Kronopioz - Hot Sloths For Bernie - 0.0
vicki_happy - vicki - 0.0
IRAndyB - Andy black - 0.0
awright4645 - Andrew - 0.0
JournoPeter - Positive Tweet Pete - 0.0
CarrieM213 - Carrie M - 0.0
gnoll110 - Noel Kelly - 0.0
Octofish - Lucy Bjorck - 0.0
weesnowie - James - 0.0
GEF_Europe - Green European Foundation - 0.0
Vuduchick - Monica - 0.0
ShaiShimmy - Shimmy Shai - 0.0
SamMarkey - Sam Markey - 0.0
alicektg - Alice Grundy - 0.0
tweetingantonia - antonia jennings - 0.0
Stonecottages - Dr Julia Imrie - 0.0
taniapdx - Tania Jennings - 0.0
JimCresswell - Jim Cresswell - 0.0
Jessicaed - Jessica Edwards - 0.20412414523193154
TimWeiskel - Tim Weiskel - 0.0
Hopenothate5 - Hopenothate - 0.0
moohair - Moo Hair - 0.0
IvySchoepf - Ivy.Schoepf - 0.0
sdecarne - Sophie de Carn - 0.0
themarketchat - The Good Oil - 0.0
tmsophie - Soph

MaryMar87156109 - Mary Marvel God Bless - 0.0
Ganjm001 - Jeannine Malcolm - 0.0
TFinn82 - Tango Foxtrot - 0.0
AmyCat64 - Amy Caterina - 0.0
MayhewDebra - Debbie Mayhew - 0.0
JaneZayler - Nobody - 0.0
sheilawalker73 - Sheila Walker - 0.0
Samoskal - Steve Moskal - 0.0
cgarside - Claire Garside - 0.0
bruce_laidlaw - Bruce Laidlaw - 0.0
Hoddy67 - Hodders - 0.0
EdmundGemmell - EDMUND GEMMELL - 0.0
yvonne08938223 - Yvonne Doney - 0.0
13373r_milk - slow your roll - 0.0
vankapro - Vanka Pro - 0.0
symb01s - me - 0.0
mipsy1234 - Michelle - 0.0
BYAMAUS - Between You & Me - 0.0
Daniel12573760 - Zippytripi - 0.0
ArtPlayKids - ArtPlay - 0.0
WxhSr - XWxhSrFqZiSsik - 0.0
FiliaGrotii - Filia Grotii - 0.0
AnnaHFord - Anna Ford - 0.0
SamPrz68 - Sam Prez - 0.0
NikitaPatodia - Nikita Patodia - 0.0
phil_vickerman - Phil vickerman - 0.0
EachOtherUk - EachOther - 0.0
SlowMatt1 - Slow Matt - 0.0
MrHominidae - A Human Ape - 0.0
c_nield - C. Nield de Crespo - 0.0
REALISE_innOV8 - Simon James Whatley - 0.0
tahuds

squeake11329515 - squeaker - 0.0
ViolinMonster - Violin Monster - 0.0
madonnacw - CW - 0.0
bigdamo - Damian Haywood - 0.0
anactionmovie - EcoWarriors Rise - 0.0
ange_jenkins - Ange Jenkins - 0.0
britainsocean - Britain's Ocean City - 0.0
Sanityland - Growing Stronger Every Day - 0.0
san_chem - Li San Chem - 0.0
curvytents - Emazing Curvy Tents - 0.0
UtilityBidder - Utility Bidder - 0.0
other98 - The Other 98% - 0.0
jenvasic - Jen Vasic - 0.0
allanb21 - Allan Bodgers - 0.0
DerbysWildlife - Derbyshire Wildlife Trust - 0.0
AnarchistsWW - Anarchists Worldwide - 0.0
Tesz - Tesz - 1.0
tixLondon - What's On London (TicketSource) - 0.0
regeameya - Ameya Rege - 0.0
TMBeecher - Tim Beecher - 0.0
MrMcEnaney - James McEnaney - 0.0
natracare - natracare - 1.0
GillKing01 - Gillian King - 0.0
dialogger - Joanna Ashworth - 0.0
WhyTrustJesus - Why Trust Jesus? - 0.0
NemesisNibiru - CaptainJimDandy - 0.0
Kate4BelleVue - Kate Halliday - 0.0
JohnHal29271650 - Yacob@Work - 0.0
BarristersHorse - Barrister's

jetfury - Geoff Barrow - 0.0
_JamieMcIntyre - Jamie McIntyre - 0.0
CA_global - Christian Aid Global - 0.0
DonegalPost - DonegalPost - 1.0
KeeleBusiness - Keele Business Gateway - 0.0
cllrsamgorst - Councillor Sam Gorst - 0.0
NaturalSystemsE - Natural Systems Engineering - 0.0
g2bioemissions - andy sproat - 0.0
Cedders68 - Cedric Knight - 0.0
dreweryal - alan drewery - 0.0
bentbananabooks - Bernie Dowling - 0.0
RachGrocott4043 - Rachel Grocott - 0.0
will_ngiam - William Ngiam - 0.0
rRaaif - Raaif E. Rasheed - 0.0
Grendels_Den - Grendel's Den - 0.0
SamLeightonDore - Samuel Leighton-Dore - 0.0
Bluebird0309 - CllrJillHoulbrook - 0.0
pippiwakame - Wakame - 0.0
ElaineSamuels - Elaine Samuels - 0.0
angilucy - Angie Capezzuto - 0.0
michaelbremer1 - Michael B (ACTOR) - 0.0
EndTheEU - Sean Hyland - 0.0
TevyeMarksonLDR - Tevye Markson - 0.0
TinkerB1985 - TinkThinks - 0.0
A_WittenbergCox - Avivah Wittenberg-Cox - 0.0
thehorsman - Paul Horsman - 0.0
AlenKarabegovic - Alen Karabegovic - 0.0
PjStasse

joannarich - Joanna Richards - 0.0
selkiesun - andre brisson - 0.0
CUBSucc - CUBS - 0.0
LifeStyleRTE - LifeStyle on RT - 0.0
sandyowoo13 - Sandy Riggs - 0.0
BlackPhysicists - BlackPhysicists - 1.0
altNOAA - Alternative NOAA - 0.0
TanDhesi - Tanmanjeet Singh Dhesi MP - 0.0
dapapabear420 - PapaQBear WWG1WGA - 0.0
cecefashkiisu - cece - 0.0
being_gaurav5 - Gaurav Sharma - 0.0
astroevrafter - S.L. - 0.0
angelsbliss - Colin Jones - 0.0
imbrianmur - The Life of Brian M - 0.0
WBerksLibraries - West Berks Libraries - 0.0
nord_ravn - Eva - 0.0
markkelly4az - TeamPete 4 Mark Kelly 4 AZ - 0.0
MetalCoreMick - Yorkshire Mick - 0.0
element1519 - BillyPilgrim5 - 0.0
johnnydubb1 - johnnydubb - 0.0
Phewall1 - Phoebe Wallace - 0.0
esther_palet - Esther Palet - 0.0
FoundationVeer - Veer Foundation - 0.0
RodBruem - Broomhilda - 0.0
MarcellaBrasset - Marcella Brassett - 0.0
Right2CityGP - Global Platform - 0.0
ClaireHoganLGA - Claire Hogan - 0.0
robertcaruso - Robert Caruso - 0.0
ofer_n - Ofer Neiman - 0.0

dannygb61 - Fun City SE8 - 0.0
ChloeMinish - Dad Insid - 0.0
RobGPDX16 - Rob G - 0.0
Humdiha - Pen Jf - 0.0
reggie_999 - Chris 'reggie' Brown - 0.0
StephenRW01 - Stephen Williams - 0.0
FutureSouth_org - Future South - 0.0
HubOttawa - Impact Hub Ottawa - 0.0
CraigStevenFra1 - climate army 2020 - 0.0
LucyMoy - LucyMoy - 1.0
gstuy19 - gstuy - 0.0
PinnacleFreight - Pinnacle Int Freight - 0.0
ZenFrq - FrqZenFrog - 0.0
DevinNunesGoat1 - David Michael Rice The Writer of DESERT SOLILOQUY - 0.0
hadija_brahim - Hadija Brahim - 0.0
JulieMilligan7 - Jules - 0.0
PLASTICFREEH0ME - Plastic Free Home - 0.0
casadojor - Jorge Casado - 0.0
leroybox73 - Leroy - 0.0
wildefyre116 - Amanda Cochran - 0.0
DesertguySteve - Desertguyy - 0.0
lwlulu350 - Wild Goose - 0.0
PoorBlack1 - Pike Bishop - 0.0
ForNorwich - Eco Action Network Norwich - 0.0
PetroleumEcon - Petroleum Economist - 0.0
sapphofem - Julie Ann Richards - 0.0
markduerden - Mark Duerden - 0.0
TRBell01 - Tom Bell - 0.0
SarahWa72623615 - Sarah Warren -

In [10]:
# analysis of description...

In [11]:
# analysis of username and screenname similarity ...

In [76]:
# Tweets for the sampled accounts.
# NOTE(review): the original comment here was cut off ("links to t");
# presumably the rows link back to accounts_df through screen_name - confirm.
tweets_df = pd.read_csv("tweets_5000_accounts.csv")

In [75]:
# NOTE(review): the preview shows tweetid as a float (e.g. 124268600000000.0),
# so the IDs have likely lost precision - consider reading them as strings; confirm.
tweets_df.head()

Unnamed: 0,dt,screen_name,text,tweetid
0,25/03/2020 05:32,JohnSarich2,RT @MadamEarth: FFS. How can we just sit at ho...,124268600000000.0
1,25/03/2020 05:31,JohnSarich2,RT @daveyk317: If it’s true that the Ruby Prin...,124268600000000.0
2,25/03/2020 05:31,JohnSarich2,I suppose this will be Morrison's thought bubb...,124268500000000.0
3,25/03/2020 05:28,JohnSarich2,@Pen2Paper43 @icred1 Yep. They're not up to it.,124268500000000.0
4,25/03/2020 05:27,JohnSarich2,RT @GayeCrispin: #auspol #COVID19Aus #covid19a...,124268500000000.0
