# Tweets_with_location Dataset

In [1]:
import pandas as pd

# Path to the raw tweet export, relative to the notebook's working directory.
file_path = 'Tweets_with_location.csv'
try:
    # ISO-8859-1 is a single-byte encoding, so decoding with it is very
    # permissive; the handler below is a safety net rather than an expected path.
    tweets_df = pd.read_csv(file_path, encoding='ISO-8859-1')
except UnicodeDecodeError as e:
    print(f"Error loading file: {e}")
    # Re-raise: swallowing the error would leave `tweets_df` undefined and the
    # failure would resurface later as a confusing NameError in another cell.
    raise

In [13]:
# Preview the first rows of the freshly loaded DataFrame (head() shows 5 rows
# by default) to sanity-check column names and values before any cleaning.
tweets_df.head()

Unnamed: 0,Name,UserName,Time,Text,Likes,Retweets,TweetURL,UserURL,City,Unnamed: 10,Unnamed: 11
0,Gerald Butts,gmbutts,2022-01-24 22:24:00,I'm glad serious researchers are taking up thi...,17,50,https://twitter.com/gmbutts/status/14857402380...,https://twitter.com/gmbutts,0,,
1,Gerald Butts,gmbutts,2022-01-24 22:27:00,1. Partisan adhesion in Canada is weak and par...,2,31,https://twitter.com/gmbutts/status/14857409939...,https://twitter.com/gmbutts,0,,
2,Abigail Boyd,AbigailBoydMLC,2022-03-07 23:46:00,"Minister Ayers, what are you doing to assist t...",8,12,https://twitter.com/AbigailBoydMLC/status/1500...,https://twitter.com/AbigailBoydMLC,0,,
3,Warren Gunnels,GunnelsWarren,2022-02-22 23:49:00,I want cable news to cover the child poverty r...,5418,203000,https://twitter.com/GunnelsWarren/status/14962...,https://twitter.com/GunnelsWarren,0,,
4,Scalawag,scalawagmag,2022-03-21 21:45:00,"""If we don't adapt the internet to the reality...",1,9,https://twitter.com/scalawagmag/status/1506024...,https://twitter.com/scalawagmag,0,,


## Data cleaning

In [7]:
# Handle Missing Data:

def handle_missing_data(df):
    """Return a cleaned copy of ``df`` with missing values dropped or filled.

    Steps, in order:
      1. Drop every row that has a missing value in any of 'Name',
         'UserName', 'Time', 'Text', 'TweetURL' or 'UserURL'.
      2. Replace missing 'Likes' and 'Retweets' values with 0.
      3. Drop the 'Country' column when it is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw tweets table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame; the surviving rows keep their original index labels.

    Raises
    ------
    KeyError
        If any of the required or count columns is absent from ``df``.
    """
    required_columns = ['Name', 'UserName', 'Time', 'Text', 'TweetURL', 'UserURL']
    count_columns = ['Likes', 'Retweets']

    # dropna() already returns a new object; the explicit copy keeps the result
    # fully independent of the caller's frame.
    df_cleaned = df.dropna(subset=required_columns).copy()

    # Assign the filled column back instead of calling
    # `df_cleaned[col].fillna(0, inplace=True)`: an inplace call on a column
    # selection is chained assignment, which warns on recent pandas and does
    # not update the parent frame under Copy-on-Write.
    for column in count_columns:
        df_cleaned[column] = df_cleaned[column].fillna(0)

    # Drop the 'Country' column if not needed
    if 'Country' in df_cleaned.columns:
        df_cleaned = df_cleaned.drop(columns=['Country'])

    return df_cleaned

In [8]:
# Data Type Conversion:

def convert_data_types(df):
    """Return a new DataFrame with 'Time', 'Likes' and 'Retweets' re-typed.

    'Time' is parsed with ``pd.to_datetime``; 'Likes' and 'Retweets' are cast
    with ``astype(int)``. All other columns are carried over unchanged and the
    input frame is left as it was.
    """
    # assign() works on a copy of the frame and evaluates the keyword
    # arguments in order, so the conversions run Time -> Likes -> Retweets.
    return df.assign(
        Time=lambda frame: pd.to_datetime(frame['Time']),
        Likes=lambda frame: frame['Likes'].astype(int),
        Retweets=lambda frame: frame['Retweets'].astype(int),
    )

In [9]:
# Text Cleaning:

def clean_text(df):
    """Return a copy of ``df`` with URLs stripped from the 'Text' column.

    Every substring matching the regular expression ``http\\S+`` (the literal
    'http' followed by any run of non-whitespace characters) is replaced with
    the empty string. Missing 'Text' values are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a string-valued 'Text' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the cleaned 'Text' column.
    """
    df_cleaned_text = df.copy()

    # Remove URLs from 'Text' column.
    # The vectorised string accessor replaces the former
    # `.apply(lambda x: re.sub(...))`: it needs no `re` import (none is present
    # in the notebook's import cell) and it does not raise on missing values.
    df_cleaned_text['Text'] = df_cleaned_text['Text'].str.replace(
        r'http\S+', '', regex=True
    )

    # Add more text cleaning steps as needed, e.g., removing special characters, hashtags, mentions.

    return df_cleaned_text

In [14]:
# Run the cleaning stages in order, rebinding tweets_df to each stage's output
# and printing a short preview after every stage.
cleaning_stages = [
    (handle_missing_data, "\nDataFrame after handling missing data:"),
    (convert_data_types, "\nDataFrame after data type conversion:"),
    (clean_text, "\nDataFrame after text cleaning:"),
]

for stage_fn, stage_banner in cleaning_stages:
    tweets_df = stage_fn(tweets_df)
    print(stage_banner)
    print(tweets_df.head())

# At this point tweets_df holds the fully cleaned data.


DataFrame after handling missing data:
             Name        UserName                Time  \
0    Gerald Butts         gmbutts 2022-01-24 22:24:00   
1    Gerald Butts         gmbutts 2022-01-24 22:27:00   
2    Abigail Boyd  AbigailBoydMLC 2022-03-07 23:46:00   
3  Warren Gunnels   GunnelsWarren 2022-02-22 23:49:00   
4        Scalawag     scalawagmag 2022-03-21 21:45:00   

                                                Text  Likes  Retweets  \
0  I'm glad serious researchers are taking up thi...     17        50   
1  1. Partisan adhesion in Canada is weak and par...      2        31   
2  Minister Ayers, what are you doing to assist t...      8        12   
3  I want cable news to cover the child poverty r...   5418    203000   
4  "If we don't adapt the internet to the reality...      1         9   

                                            TweetURL  \
0  https://twitter.com/gmbutts/status/14857402380...   
1  https://twitter.com/gmbutts/status/14857409939...   
2  https:/