# Setting up the functions for data cleaning

In [1]:
import pandas as pd
import time

In [2]:
# Handles of the accounts under study (the "Top 100" list; it holds 98 entries).
# clean_friends_list() keeps only friends whose handle appears in this list, and
# the list is passed to hausputz() to choose which per-account CSV files to load.
# Membership is tested with exact, case-sensitive string comparison.

top100 = ["realDonaldTrump", "WhiteHouse", "TeamTrump", "GOPChairwoman", "DanScavino", "Jim_Jordan", "GOP", "Scavino45", "DonaldJTrumpJr",
          "IvankaTrump", "GreggJarrett", "EricTrump", "TomFitton", "paulsperry", "RepMarkMeadows", "marklevinshow", "TrumpWarRoom",
          "LindseyGrahamSC", "charliekirk11", "dbongino", "SenateGOP", "GOPLeader", "JudicialWatch", "senatemajldr", "foxandfriends",
          "seanhannity", "VP", "SteveScalise", "MZHemingway", "FLOTUS", "LouDobbs", "DailyCaller", "BuckSexton", "RepAndyBiggsAZ", "RepLeeZeldin",
          "KimStrassel", "Mike_Pence", "MariaBartiromo", "PressSec", "SaraCarterDC", "RepDougCollins", "RepMattGaetz", "NHC_Atlantic",
          "JohnWHuber", "MarshaBlackburn", "AndrewCMcCarthy", "RandPaul", "IngrahamAngle", "TVNewsHQ", "jsolomonReports",
          "CLewandowski_", "parscale", "GOPoversight", "ByronYork", "FoxNews", "GeraldoRivera", "DevinNunes", "BreitbartNews", "SenTomCotton",
          "dcexaminer", "thebradfordfile", "RNCResearch", "seanmdav", "JackPosobiec", "CDCgov", "marcorubio", "Lrihendry",
          "SenTedCruz", "thehill", "PollWatch2020", "OANN", "DavidJHarrisJr", "JennaEllisEsq", "NRA", "Varneyco", "kayleighmcenany", "RudyGiuliani",
          "hughhewitt", "JesseBWatters", "HouseGOP", "RealJamesWoods", "fema", "KatrinaPierson", "SenJohnBarrasso", "ericbolling", "HawleyMO",
          "thejtlewis", "TimMurtaugh", "RichardGrenell", "tedcruz", "bennyjohnson", "TheRightMelissa", "KellyannePolls", "SenRonJohnson",
          "SenThomTillis", "EliseStefanik", "brithume", "ChuckGrassley"]

Die Top-100-Liste enthält nur 98 Accounts:

* 1 Account ist privat und kann nicht gescraped werden (`"DRUDGE_REPORT"`)
* Mike Pence kommt 2x vor, sein Handle war bis 2016 `mike_pence` und ist seither `Mike_Pence`. Da Twitter bei Handles nicht zwischen Gross- und Kleinschreibung unterscheidet, ist das derselbe Account.

Die folgende Funktion bereitet die Daten so auf, dass sie in weiterer Folge auf Follower überprüft und weiter verarbeitet werden können:

In [3]:
def prepare_friends_data(filename):
    """
    Load the scraped friends list of one account and label every row with it.

    Reads ``friends/<filename>_friends.csv`` and prepends a constant column
    that holds the account handle.

    :param filename: Twitter handle used in the CSV file name
    :returns: DataFrame with the two columns "Account" and "Follows"
    """
    csv_path = f"friends/{filename}_friends.csv"
    friends = pd.read_csv(csv_path)

    # Prepend the owning account as a constant first column (same text as in the file name).
    friends.insert(loc=0, column="Account", value=format(filename))

    # Positional rename: this only works when the CSV contributed exactly one
    # column, which then becomes "Follows".
    friends.columns = ["Account", "Follows"]
    return friends

Die folgende Funktion überprüft, ob Accounts im DataFrame in der top100-Liste vorkommen, und droppt alle anderen. Ausgegeben wird ein DataFrame mit folgender Struktur:

```
+-------------------------+
| Account      | Follows  |
+--------------+----------+
| Account-Name | Friend 1 |
| Account-Name | Friend 2 |
| Account-Name | Friend 3 |
| ...          | ...      |
+--------------+----------+
```

In [4]:
def clean_friends_list(data, allowed=None):
    """
    Keep only the rows whose "Follows" handle is contained in the allowed list.

    The frame is filtered IN PLACE and its index is renumbered from 0; the
    same object is returned, so callers may either use the return value or
    keep working with the frame they passed in.

    Matching is an exact, case-sensitive string comparison.

    :param data: DataFrame with a "Follows" column, as produced by
        prepare_friends_data()
    :param allowed: collection of handles to keep; defaults to the
        module-level ``top100`` list
    :returns: the same DataFrame, reduced to rows whose "Follows" value is in
        ``allowed``
    """
    if allowed is None:
        allowed = top100

    # One vectorized membership test replaces the per-row iterrows()/drop() loop,
    # which rebuilt the frame for every rejected row.
    keep = data["Follows"].isin(allowed).to_numpy()

    # Drop all rejected rows in a single call, keeping the in-place contract.
    data.drop(index=data.index[~keep], inplace=True)
    data.reset_index(drop=True, inplace=True)
    return data

### Test der beiden Funktionen:

In [5]:
# Smoke test: load the raw friends list of a single account and show it.
test = prepare_friends_data("WhiteHouse")
print(test)

       Account          Follows
0   WhiteHouse    robertcobrien
1   WhiteHouse   StephGrisham45
2   WhiteHouse       Mike_Pence
3   WhiteHouse         OMBPress
4   WhiteHouse            WHNSC
5   WhiteHouse          Cabinet
6   WhiteHouse   KellyannePolls
7   WhiteHouse        Scavino45
8   WhiteHouse       SecondLady
9   WhiteHouse         PressSec
10  WhiteHouse               VP
11  WhiteHouse           FLOTUS
12  WhiteHouse  realDonaldTrump
13  WhiteHouse            POTUS


In [8]:
# Smoke test: reduce the frame loaded above to friends that appear in top100.
# NOTE: clean_friends_list() filters its argument in place and returns that same
# object, so after this cell `test` and `test2` are one (already filtered) frame.
test2 = clean_friends_list(test)
print(test2)

      Account          Follows
0  WhiteHouse       Mike_Pence
1  WhiteHouse   KellyannePolls
2  WhiteHouse        Scavino45
3  WhiteHouse         PressSec
4  WhiteHouse               VP
5  WhiteHouse           FLOTUS
6  WhiteHouse  realDonaldTrump


# Cleaning data in a loop

In [10]:
def hausputz(list_to_clean, output_path="followers_complete.csv"):
    """
    Load, filter and combine the friends lists of several accounts.

    For every handle the matching CSV is read with prepare_friends_data(),
    reduced with clean_friends_list(), and the per-account results are stacked
    into one frame that is also written to disk.

    :param list_to_clean: List of Twitter handles whose CSV files should be cleaned.
    :param output_path: Destination of the combined CSV file
        (default: "followers_complete.csv" in the working directory).
    :returns: DataFrame with the columns "Account" and "Follows" and a fresh
        0..n-1 index; empty (but with both columns) when there is nothing to combine.
    """
    # Collect the per-account frames and concatenate once at the end:
    # DataFrame.append() no longer exists in pandas 2.x, and growing a frame
    # inside the loop copies all previous rows on every iteration.
    cleaned_frames = []
    for account in list_to_clean:
        cleaned_frames.append(clean_friends_list(prepare_friends_data(account)))
        print(f"Account {account} analyzed.")

    if cleaned_frames:
        complete_data = pd.concat(cleaned_frames, ignore_index=True)
    else:
        # pd.concat() rejects an empty list, so build the empty result explicitly.
        complete_data = pd.DataFrame(columns=["Account", "Follows"])

    # No placeholder row is ever created, so every row is real data and nothing
    # has to be removed before saving.
    complete_data.to_csv(output_path, index=False)
    return complete_data

In [12]:
# Record the wall-clock start (seconds since the epoch):
start_time = time.time()
print(f"Start time: {start_time}")

all_friends = hausputz(top100)

# Determine end time and time needed for execution:
print(f"Finishing time: {time.time()}")
print(f"Time for execution (sec.): {time.time() - start_time}")

Start time: 1607442548.4376347
Account realDonaldTrump analyzed.
Account WhiteHouse analyzed.
Account TeamTrump analyzed.
Account GOPChairwoman analyzed.
Account DanScavino analyzed.
Account Jim_Jordan analyzed.
Account GOP analyzed.
Account Scavino45 analyzed.
Account DonaldJTrumpJr analyzed.
Account IvankaTrump analyzed.
Account GreggJarrett analyzed.
Account EricTrump analyzed.
Account TomFitton analyzed.
Account paulsperry analyzed.
Account RepMarkMeadows analyzed.
Account marklevinshow analyzed.
Account TrumpWarRoom analyzed.
Account LindseyGrahamSC analyzed.
Account charliekirk11 analyzed.
Account dbongino analyzed.
Account SenateGOP analyzed.
Account GOPLeader analyzed.
Account JudicialWatch analyzed.
Account senatemajldr analyzed.
Account foxandfriends analyzed.
Account seanhannity analyzed.
Account VP analyzed.
Account SteveScalise analyzed.
Account MZHemingway analyzed.
Account FLOTUS analyzed.
Account LouDobbs analyzed.
Account DailyCaller analyzed.
Account BuckSexton analyz

Ausführungszeit: 2011.23 Sekunden (ca. 33 Minuten).

In [13]:
# Show the combined DataFrame returned by hausputz().
all_friends

Unnamed: 0,Account,Follows
0,realDonaldTrump,LouDobbs
1,realDonaldTrump,GOPLeader
2,realDonaldTrump,senatemajldr
3,realDonaldTrump,Jim_Jordan
4,realDonaldTrump,MariaBartiromo
...,...,...
4597,ChuckGrassley,LouDobbs
4598,ChuckGrassley,HouseGOP
4599,ChuckGrassley,GOPoversight
4600,ChuckGrassley,SenateGOP
