In [16]:
import pandas as pd
import warnings
# Suppress all warnings from this point on; note that this also hides pandas warnings
warnings.filterwarnings('ignore')
# Allow pandas to render up to 500 columns when a frame is displayed
pd.set_option('display.max_columns', 500)

# Hand-made corrections: (row label, column) cells whose value is reset to 0
# because the raw sheet holds something other than 1 or 0 there.
_MANUAL_ZERO_FIXES = (
    (596, 'Not about Sudan'),
    (680, 'pro RSF'),
    (774, 'Likely bot'),
    (774, 'Likely not a bot'),
    (687, 'anti SAF'),
)


def clean_data(df, drop_na=True, drop_irrelevant=True):
    """Return a cleaned copy of the annotated tweets frame.

    The input frame is never modified; all work happens on a copy.

    Steps, in order:
      1. Reset the hand-picked cells listed in ``_MANUAL_ZERO_FIXES`` to 0.
         A fix is skipped when its row label or column is absent, so the
         frame is never silently enlarged with new rows/columns.
      2. Add a ``code`` column holding the text after the last ``/`` of
         ``permalink``.
      3. If ``drop_na``: drop rows with a missing value in any label column,
         fill missing ``user``/``username`` with ``'unknown'`` and every other
         missing value with 0.
      4. If ``drop_irrelevant``: keep only rows where ``Not about Sudan`` == 0.
      5. Reset the index to 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Must contain ``permalink``; the label columns are required
        when ``drop_na`` is true and ``Not about Sudan`` when
        ``drop_irrelevant`` is true.
    drop_na : bool, default True
        Apply step 3.
    drop_irrelevant : bool, default True
        Apply step 4.

    Returns
    -------
    pandas.DataFrame
        A new, re-indexed frame.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    # Work on a copy so the caller's frame stays untouched and the function
    # can be re-run on the same raw input with the same result.
    cleaned = df.copy()

    # Clean values that are not 1 or 0 (only where the cell actually exists)
    for row_label, column in _MANUAL_ZERO_FIXES:
        if row_label in cleaned.index and column in cleaned.columns:
            cleaned.at[row_label, column] = 0

    # Use the last path segment of the permalink as the unique code of each
    # tweet. The .str accessor leaves missing permalinks as NaN instead of
    # raising on a float.
    cleaned['code'] = cleaned['permalink'].str.split('/').str[-1]

    # Label columns relevant to the analysis ('Pro peace,' really does end
    # with a comma in the sheet header)
    subset = ['anti RSF', 'pro RSF', 'anti SAF', 'pro SAF', 'Pro peace,', 'anti peace', 'Pro War', 'anti war', 'pro civilian', 'anti civilians', 'Sudanese', 'Not Sudanese']

    # Drop rows with missing labels, then fill the remaining gaps:
    # 'unknown' for the two name columns, 0 everywhere else.
    if drop_na:
        cleaned = cleaned.dropna(subset=subset)
        cleaned = cleaned.fillna({'user': 'unknown', 'username': 'unknown'}).fillna(0)

    # Drop irrelevant tweets that are not about Sudan
    if drop_irrelevant:
        cleaned = cleaned[cleaned['Not about Sudan'] == 0]

    return cleaned.reset_index(drop=True)


# Read the annotated tweets from the Excel workbook and clean them in one step
data = clean_data(pd.read_excel('../data/data.xlsx'))

# Preview the result: dimensions first, then the leading rows
display(data.shape)
display(data.head())

# Persist the cleaned frame as CSV (index column omitted) and confirm
data.to_csv(path_or_buf='../data/cleaned_data.csv', index=False)
display('Data saved successfully to ../data/cleaned_data.csv')

(613, 39)

Unnamed: 0,user,username,timestamp,post,permalink,reposts,likes,impressions,quotes,replies,bookmarks,value,report_file,date,time,relevance,anti RSF,pro RSF,anti SAF,pro SAF,"Pro peace,",anti peace,Pro War,anti war,pro civilian,anti civilians,not specified,no polarisation,Geopolticis,Sudanese,Not Sudanese,Sudanese N/A,Likely bot,Likely not a bot,Cannot be identified.,News,Not about Sudan,Annotation Confidence,code
0,☬ود نيـالا ☬,@Mo7amedWagi7,2023-04-15 15:29:44,@OmerElameen2 @drhzagalo لما يكون عندك حكومة و...,https://www.twitter.com/user/status/1647260928...,0,1,310,0,0,0,0.03,Report 1-1,2023-04-15,15:29:44,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,9.0,1647260928231055360
1,عربي21,@Arabi21News,2023-08-28 20:02:00,"الخارجية السودانية توضح لـ""عربي21"" حقيقة زيارة...",https://www.twitter.com/user/status/1696251807...,1,5,3402,1,1,0,3115.19,Report 3-2,2023-08-28,20:02:00,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,10.0,1696251807083758021
2,تحالف عاصفة الحزم,@AaSsiFfah2015,2023-04-15 22:52:29,#السودان \nنشرة المستشفيات:\nالحرب حتى الآن تس...,https://www.twitter.com/user/status/1647372349...,0,1,472,0,0,0,15.58,Report 1-3,2023-04-15,22:52:29,1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,8.0,1647372349585358848
3,H_Lamami,@HLamami2,2023-04-15 13:36:46,■اذا رايت الاخوان يحبون حميدتي ويطبلون له\nفاع...,https://www.twitter.com/user/status/1647232497...,0,0,72,0,0,0,4.04,Report 1-2,2023-04-15,13:36:46,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0,8.0,1647232497238028289
4,وفاء علي,@eali_wafa15327,2023-08-29 12:17:05,@taherabdulwhab @Mohmd_Elsiddig @saramubael م ...,https://www.twitter.com/user/status/1696497195...,0,4,30,0,1,0,0.05,Report 3-1,2023-08-29,12:17:05,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,8.0,1696497195434881292


'Data saved successfully to ../data/cleaned_data.csv'