## Remove Duplicate Rows

In [1]:
# Importing the csv file for after the data collection phase

import pandas as pd
tweets_df = pd.read_csv('Dataset2.csv', lineterminator='\n')

In [2]:
# Original Dataframe formed from the tweets csv file: (100,000 rows originally)

tweets_df

Unnamed: 0,0,1
0,1488300975022649351,@allballsofyarn Thank Pfizer for Jacinta eh?
1,1488295746369695751,UK Health Secretary Announces U-TURN on MANDAT...
2,1488289753548083207,@georgiebingham The word hypocrite the only on...
3,1488287987683278850,@Seyirhodes There's about 4 chapters on aids v...
4,1488285997674770436,@benking01 @NHS Hope you’re feeling ok Ben &am...
...,...,...
19564,1465846782156546053,This article is far too long but way down it c...
19565,1465846406212730883,What a fucking joke this. The vaccine was mean...
19566,1465843105190535174,Faster vaccine rollout = longer #NHS waiting l...
19567,1465842266036224004,Why would a GP want to waste their time checki...


In [3]:
# Remove duplicate rows: (54931 Rows left now)

tweets_df = tweets_df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
tweets_df

Unnamed: 0,0,1
0,1488300975022649351,@allballsofyarn Thank Pfizer for Jacinta eh?
1,1488295746369695751,UK Health Secretary Announces U-TURN on MANDAT...
2,1488289753548083207,@georgiebingham The word hypocrite the only on...
3,1488287987683278850,@Seyirhodes There's about 4 chapters on aids v...
4,1488285997674770436,@benking01 @NHS Hope you’re feeling ok Ben &am...
...,...,...
19564,1465846782156546053,This article is far too long but way down it c...
19565,1465846406212730883,What a fucking joke this. The vaccine was mean...
19566,1465843105190535174,Faster vaccine rollout = longer #NHS waiting l...
19567,1465842266036224004,Why would a GP want to waste their time checki...


## Rename Columns

In [4]:
# Renaming Columns to useful names:

tweets_df = tweets_df.rename({'0': 'UserID', '1': 'Original Tweet'}, axis=1)
tweets_df

Unnamed: 0,UserID,Original Tweet
0,1488300975022649351,@allballsofyarn Thank Pfizer for Jacinta eh?
1,1488295746369695751,UK Health Secretary Announces U-TURN on MANDAT...
2,1488289753548083207,@georgiebingham The word hypocrite the only on...
3,1488287987683278850,@Seyirhodes There's about 4 chapters on aids v...
4,1488285997674770436,@benking01 @NHS Hope you’re feeling ok Ben &am...
...,...,...
19564,1465846782156546053,This article is far too long but way down it c...
19565,1465846406212730883,What a fucking joke this. The vaccine was mean...
19566,1465843105190535174,Faster vaccine rollout = longer #NHS waiting l...
19567,1465842266036224004,Why would a GP want to waste their time checki...


## Remove URLs:

In [5]:
# Function to replace URL of a text with an empty space:

def clean_data_from_urls(dataframe):
    dataframe['Without URL'] = dataframe['Original Tweet'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'URL')

In [6]:
# Apply above url remover function and print the new dataframe with the new column

clean_data_from_urls(tweets_df)
tweets_df['Without URL'].head(10)

  dataframe['Without URL'] = dataframe['Original Tweet'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'URL')


0         @allballsofyarn Thank Pfizer for Jacinta eh?
1    UK Health Secretary Announces U-TURN on MANDAT...
2    @georgiebingham The word hypocrite the only on...
3    @Seyirhodes There's about 4 chapters on aids v...
4    @benking01 @NHS Hope you’re feeling ok Ben &am...
5    @CarrDutton @drstevejames Why are all double-b...
6    @bbclaurak The press are so biased towards The...
7    @AndrewJPelling We have universal availability...
8    @sebastianbach Seb. \nVaccines have some effec...
9    @Tobias_Ellwood Interesting processing. I don'...
Name: Without URL, dtype: object

## Replace Handlers/Usernames:

In [7]:
# Function to Replace Handlers/Usernames in the Dataframe:

def replace_handlers(dataframe):
    dataframe['Removed Handlers'] = dataframe['Without URL'].str.replace('(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)', 'USER')

In [8]:
# Apply above handler remover function and print the new dataframe with the new column

replace_handlers(tweets_df)
tweets_df['Removed Handlers'].head(10)

  dataframe['Removed Handlers'] = dataframe['Without URL'].str.replace('(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)', 'USER')


0                    USER Thank Pfizer for Jacinta eh?
1    UK Health Secretary Announces U-TURN on MANDAT...
2    USER The word hypocrite the only ones can comp...
3    USER There's about 4 chapters on aids vaccine ...
4    USER USER Hope you’re feeling ok Ben &amp; tak...
5    USER USER Why are all double-barrelled nomencl...
6    USER The press are so biased towards The Prime...
7    USER We have universal availability of effecti...
8    USER Seb. \nVaccines have some effect.\nMaybe ...
9    USER_Ellwood Interesting processing. I don't t...
Name: Removed Handlers, dtype: object

## Publish to CSV:

In [9]:
tweets_df.to_csv('After-Pre-Processing(Temp - Timeframe2).csv', encoding='utf-8', index=False)