# DataFrame Builder

### Miguel Cozar and Carlos Munoz, OSNA

In [35]:
import pandas as pd
from TwitterUser import TwitterUser

#### 1- Clean the ChatGPT text that was dumped into an Excel file and save it as a proper CSV file

In [4]:
# Read the raw input (header=None: the sheet has no header row, so the single
# column is labelled 0 and the first row is kept as data).
df = pd.read_excel('people.xlsx', header=None)
# Give the column a name. rename() returns a new frame, so assign it back
# instead of relying on a non-boolean `inplace=1`.
df = df.rename(columns={0: 'people'})
# Each row is assumed to look like "<number>. <Full Name>" (TODO confirm against
# the raw sheet). Split only on the FIRST period so that names which contain
# periods themselves (e.g. "J.K. Rowling") are kept whole instead of truncated.
df['people'] = df['people'].apply(lambda x: x.split('.', 1)[1].strip())
df = df.drop_duplicates()
# Save to a clean csv file (utf-8-sig writes a BOM in front of the UTF-8 data)
df.to_csv('cleaned_people.csv', index = False, encoding='utf-8-sig')
df.head(5)

Unnamed: 0,people
0,Joe Biden
1,Elon Musk
2,Greta Thunberg
3,Angela Merkel
4,Xi Jinping


#### 2- Search for every person using the Twitter API. Some accounts will probably not be found, or the wrong account will be matched, but we are looking for nodes, not accuracy

In [6]:
# Define a twitter user instance (TwitterUser is imported from the TwitterUser module).
# NOTE(review): presumably this object wraps the Twitter API client and its
# credentials — confirm in the TwitterUser module.
user = TwitterUser()

In [10]:
# Search each person on Twitter and keep the result per name in a dictionary.
# A failed lookup is stored as None so the name still has an entry and can be
# detected later.
user_dictionary = {}
for person in df.people.to_list():
    try:
        user_dictionary[person] = user.search_user(person)
    except Exception:
        # Best-effort: one failing search must not abort the whole run.
        # Catching `Exception` instead of using a bare `except:` lets
        # KeyboardInterrupt / SystemExit propagate, so this loop of lookups
        # can still be interrupted from the kernel.
        user_dictionary[person] = None
user_dictionary

{'Joe Biden': {'id': 939091,
  'id_str': '939091',
  'name': 'Joe Biden',
  'screen_name': 'JoeBiden',
  'location': 'Washington, DC',
  'description': 'Husband to @DrBiden, proud father and grandfather. Ready to build back better for all Americans. Official account is @POTUS.',
  'url': 'https://t.co/UClrPuJpyZ',
  'entities': {'url': {'urls': [{'url': 'https://t.co/UClrPuJpyZ',
      'expanded_url': 'http://joebiden.com',
      'display_url': 'joebiden.com',
      'indices': [0, 23]}]},
   'description': {'urls': []}},
  'protected': False,
  'followers_count': 36894438,
  'friends_count': 47,
  'listed_count': 40867,
  'created_at': 'Sun Mar 11 17:51:24 +0000 2007',
  'favourites_count': 25,
  'utc_offset': None,
  'time_zone': None,
  'geo_enabled': False,
  'verified': True,
  'statuses_count': 8894,
  'lang': None,
  'status': {'created_at': 'Tue Feb 07 21:02:21 +0000 2023',
   'id': 1623064645891129344,
   'id_str': '1623064645891129344',
   'text': 'Proud of what we’re getting 

In [11]:
# Inspect one raw entry to see what a stored search result looks like
user_dictionary['Cristiano Ronaldo']

{'id': 155659213,
 'id_str': '155659213',
 'name': 'Cristiano Ronaldo',
 'screen_name': 'Cristiano',
 'location': '',
 'description': 'This Privacy Policy addresses the collection and use of personal information - https://t.co/N9W2J34YdA',
 'url': 'https://t.co/JlMkKHxKo1',
 'entities': {'url': {'urls': [{'url': 'https://t.co/JlMkKHxKo1',
     'expanded_url': 'http://www.facebook.com/cristiano',
     'display_url': 'facebook.com/cristiano',
     'indices': [0, 23]}]},
  'description': {'urls': [{'url': 'https://t.co/N9W2J34YdA',
     'expanded_url': 'http://www.cristianoronaldo.com/terms',
     'display_url': 'cristianoronaldo.com/terms',
     'indices': [79, 102]}]}},
 'protected': False,
 'followers_count': 107529005,
 'friends_count': 65,
 'listed_count': 85591,
 'created_at': 'Mon Jun 14 19:09:20 +0000 2010',
 'favourites_count': 6,
 'utc_offset': None,
 'time_zone': None,
 'geo_enabled': True,
 'verified': True,
 'statuses_count': 3869,
 'lang': None,
 'status': {'created_at': 'Fr

In [26]:
# Now we get the keys that we want for our final dataset
preferred_keys = ['id', 'name', 'screen_name', 'location', 'followers_count', 'friends_count']

# And we extract them from the dictionary, we will make a simple function to protect from None values
def extract_key(user, key):
    """Return ``user_dictionary[user][key]``, or ``''`` when it is unavailable.

    Args:
        user: Name used as the key into the module-level ``user_dictionary``.
        key: Field to read from that person's stored search result.

    Returns:
        The stored value, or the empty string when the person has no entry,
        the entry is None (failed search), or the field is missing.
    """
    try:
        return user_dictionary[user][key]
    except (KeyError, TypeError):
        # KeyError: unknown person or missing field.
        # TypeError: the entry is None (or otherwise not subscriptable by key).
        # Anything else is unexpected and should surface instead of being hidden.
        return ''

for key in preferred_keys:
    # Bind `key` as a default argument so the lambda does not depend on the
    # loop variable's later value.
    df[key] = df.people.apply(lambda x, key=key: extract_key(x, key))

df.head(6)

Unnamed: 0,people,id,name,screen_name,location,followers_count,friends_count
0,Joe Biden,939091,Joe Biden,JoeBiden,"Washington, DC",36894438,47
1,Elon Musk,44196397,Elon Musk,elonmusk,,128325520,178
2,Greta Thunberg,1006419421244678144,Greta Thunberg,GretaThunberg,Sverige,5848016,2866
3,Angela Merkel,102550061,Plaid Angela Merkel,Plaid_Merkel,Berlin (not),33770,308
4,Xi Jinping,1483753685616365571,Xi Jinping 🇨🇳 ᶠᵃᵏᵉ,XoiXiJinping,,16086,185
5,Jeff Bezos,15506669,Jeff Bezos,JeffBezos,,6118040,173


We can see that in some cases the correct account is not found, as in the case of Xi Jinping, where a fake account was matched instead.

In [27]:
# See the rows without data, to check if the dataset is acceptable.
# extract_key fills unavailable values with '', so an empty-string id marks
# a person for whom nothing could be extracted.
df[df.id == '']

Unnamed: 0,people,id,name,screen_name,location,followers_count,friends_count
241,Isabel Ge Mahe,,,,,,
580,Anastasija Sevastova,,,,,,


In [29]:
# Remove every person for whom no id could be extracted (id == '').
# Filtering on the sentinel instead of hard-coded row labels keeps this cell
# correct when the input list changes and safe to run more than once.
df = df[df.id != ''].copy()

#### 3- Now we have the updated DataFrame

In [33]:
# Preview the first rows of the dataframe after removing the rows without an id
df.head(5)

Unnamed: 0,people,id,name,screen_name,location,followers_count,friends_count
0,Joe Biden,939091,Joe Biden,JoeBiden,"Washington, DC",36894438,47
1,Elon Musk,44196397,Elon Musk,elonmusk,,128325520,178
2,Greta Thunberg,1006419421244678144,Greta Thunberg,GretaThunberg,Sverige,5848016,2866
3,Angela Merkel,102550061,Plaid Angela Merkel,Plaid_Merkel,Berlin (not),33770,308
4,Xi Jinping,1483753685616365571,Xi Jinping 🇨🇳 ᶠᵃᵏᵉ,XoiXiJinping,,16086,185


In [34]:
# Persist the enriched table without the row index.
# NOTE: this overwrites the names-only 'cleaned_people.csv' written in step 1.
df.to_csv('cleaned_people.csv', index=False, encoding='utf-8-sig')