In [1]:
import pandas as pd
from datetime import datetime
from pytz import timezone
import pytz

## Read CSV Data

In [2]:
data_df = pd.read_csv('data.csv')
data_df = data_df.drop(columns = 'Unnamed: 0')
data_df

Unnamed: 0,Title,Link,Image,Description
0,Product Management Live Chat by Booking.com PM,https://www.eventbrite.com/e/product-managemen...,https://assets.productschool.com/wp-content/up...,"['Allen Foster Pushparaj', 'Thursday, June 18t..."
1,Webinar: Building Great Relationships With You...,https://www.eventbrite.com/e/webinar-building-...,https://assets.productschool.com/wp-content/up...,"['Jean-Pierre Pequito', 'Thursday, June 18th, ..."
2,Webinar: How to be More Effective Data-Driven ...,https://www.eventbrite.com/e/webinar-how-to-be...,https://assets.productschool.com/wp-content/up...,"['Emile Saad', 'Thursday, June 18th, 2020', '2..."
3,Webinar: Driving AI Incubations as a PM by Zil...,https://www.eventbrite.com/e/webinar-driving-a...,https://assets.productschool.com/wp-content/up...,"['Debapriya Basu', 'Friday, June 19th, 2020', ..."
4,Webinar: Taking Care of your Product by HubSpo...,https://www.eventbrite.com/e/webinar-taking-ca...,https://assets.productschool.com/wp-content/up...,"['Klajdi Turlla', 'Friday, June 19th, 2020', '..."
...,...,...,...,...
93,Product Management Live Chat by fmr Northweste...,https://www.eventbrite.com/e/product-managemen...,https://assets.productschool.com/wp-content/up...,"['Vivek Bedi', 'Thursday, August 20th, 2020', ..."
94,Product Management Live Chat by Airbnb PM,https://www.eventbrite.com/e/product-managemen...,https://www.productschool.com/wp-content/theme...,"['Azad Zahoory', 'Tuesday, August 25th, 2020',..."
95,Product Management Live Chat by Drift Director...,https://www.eventbrite.com/e/product-managemen...,https://assets.productschool.com/wp-content/up...,"['Maggie Crowley', 'Thursday, August 27th, 202..."
96,Product Management Live Chat by Google Product...,https://www.eventbrite.com/e/product-managemen...,https://assets.productschool.com/wp-content/up...,"['Tyson Mao', 'Tuesday, September 1st, 2020', ..."


## Clean Columns

In [3]:
# Remove Webinar word
data_df['Title'] = data_df.Title.str.replace('Webinar: ', '')

In [4]:
# get Datatype of Description and extract all the details.
data_df['Description'] = pd.eval(data_df['Description'])
data_df[["Author", "Date", "time", "medium"]] = data_df.Description.apply(pd.Series)
data_df = data_df.drop(columns = "Description")

### Preprocess times and timezones

In [5]:
# Get start time, end time and timezone.
data_df[['start_time', 'end_time']] = data_df['time'].str.split(' - ', expand=True)
data_df[['end_time', 'time_zone']] = data_df['end_time'].str.rstrip().str.split(' ', expand=True)
data_df.time_zone = data_df.time_zone.fillna("time_zone_not_filled") #Fill in information which can be used downstream.

data_df = data_df.drop(columns = "time")

In [6]:
try:
    data_df['time_zone'] = data_df['time_zone'].apply(lambda x: x.replace('(','').replace(')','')) 
except AttributeError as err:
    print("ATTRIBUTE ERROR:"+ str(err))

In [7]:
# This may not be required but doing as of now.
def convert_to_24hours(time):
    try:
        time24 = datetime.strptime(time, '%I:%M%p')
        return datetime.strftime(time24, "%H:%M")
    except ValueError as err:
        print("VALUEERROR:" + str(err))
    except TypeError as err:
        print("TYPEERROR:" + str(err))
    

In [8]:
data_df['start_time'] = data_df['start_time'].apply(convert_to_24hours)

VALUEERROR:time data 'Online' does not match format '%I:%M%p'
VALUEERROR:time data 'Online' does not match format '%I:%M%p'


In [9]:
data_df['end_time'] = data_df['end_time'].apply(convert_to_24hours)

TYPEERROR:strptime() argument 1 must be str, not None
TYPEERROR:strptime() argument 1 must be str, not None


In [10]:
data_df[['dow', 'date', 'year']] = data_df.Date.str.split(',', expand=True)

In [11]:
data_df['date'] = data_df['date'].str[0:-2]

In [12]:
data_df['start_time'] = data_df['date'] + " " + data_df['year'] + " " + data_df['start_time']

In [13]:
data_df['end_time'] = data_df['date'] + " " + data_df['year'] + " " + data_df['end_time']

In [14]:

def get_in_ist_timezone(row, col="start_time"):
    start_time = row[col]
    time_zone = row['time_zone']
    try:
        tz = None
        # Depending on the new timezones, you will have to add more conditions.
        # Refer to this link: https://gist.github.com/heyalexej/8bf688fd67d7199be4a1682b3eec7568
        # for all the timezones available.
        if time_zone == 'PT': 
            tz = pytz.timezone('US/Pacific') 
        start_time = start_time.rstrip()
        start_time = start_time.lstrip()
        start_time = start_time.replace('  ', ' ')
        start_time = datetime.strptime(start_time, '%B %d %Y %H:%M')
        start_time = tz.localize(start_time)
        return start_time.astimezone(pytz.timezone('Asia/Calcutta'))
    except Exception as err:
        print(str(err))
        return None

In [15]:
data_df['start_time_ist'] = data_df.apply(get_in_ist_timezone, col='start_time', axis=1)
data_df['end_time_ist'] = data_df.apply(get_in_ist_timezone, col='end_time', axis=1)

'float' object has no attribute 'rstrip'
'float' object has no attribute 'rstrip'
'NoneType' object has no attribute 'localize'
'float' object has no attribute 'rstrip'
'float' object has no attribute 'rstrip'
'NoneType' object has no attribute 'localize'


In [16]:
data_df['event_duration'] = data_df['end_time_ist'] - data_df['start_time_ist']

## Produce the required columns

In [17]:
data_df = data_df.rename(
    columns = {
        "Title":"Topic of the webinar",
        "Link": "Link of the webinar",
    }
)

In [18]:
data_df['Date (DD-Mon-YY)'] = data_df.start_time_ist.dt.strftime("%d-%b-%Y")

In [19]:
data_df['Start Time (hh:mm AM/PM)'] = data_df.start_time_ist.dt.strftime("%I:%M %p")

In [20]:
data_df['End  Time (hh:mm AM/PM)'] = data_df.end_time_ist.dt.strftime("%I:%M %p")

In [21]:
data_df['Duration (in min)'] =  ((data_df['end_time_ist'] -
                                      data_df['start_time_ist'])
                                     .astype('timedelta64[m]'))

data_df['Duration (in min)'] = data_df['Duration (in min)'].astype('str').str[0:-2] + ' min'

In [22]:
data_df["Organising Body"] = None

In [23]:
data_df["Theme"] = None

In [24]:
select_cols = ["Topic of the webinar",
"Date (DD-Mon-YY)",
"Start Time (hh:mm AM/PM)",
"End  Time (hh:mm AM/PM)",
"Duration (in min)",
"Organising Body",
"Theme",
"Link of the webinar",
"Image",
# "Key Take aways / Agenda",
# "Platform",
# "Price (Free/Paid)",
# "How did you hear about this event?",
# "Your Name",
# "Comments and/or questions",
# "Email Address",
# "Max Number of participants?",
# "Timestamp"
              ]

In [25]:
data_df[select_cols]

Unnamed: 0,Topic of the webinar,Date (DD-Mon-YY),Start Time (hh:mm AM/PM),End Time (hh:mm AM/PM),Duration (in min),Organising Body,Theme,Link of the webinar,Image
0,Product Management Live Chat by Booking.com PM,18-Jun-2020,11:00 PM,11:30 PM,30 min,,,https://www.eventbrite.com/e/product-managemen...,https://assets.productschool.com/wp-content/up...
1,Building Great Relationships With Your Team by...,19-Jun-2020,12:00 AM,12:30 AM,30 min,,,https://www.eventbrite.com/e/webinar-building-...,https://assets.productschool.com/wp-content/up...
2,How to be More Effective Data-Driven PM by fmr...,19-Jun-2020,02:30 AM,03:00 AM,30 min,,,https://www.eventbrite.com/e/webinar-how-to-be...,https://assets.productschool.com/wp-content/up...
3,Driving AI Incubations as a PM by Zillow Princ...,20-Jun-2020,12:00 AM,12:30 AM,30 min,,,https://www.eventbrite.com/e/webinar-driving-a...,https://assets.productschool.com/wp-content/up...
4,Taking Care of your Product by HubSpot PM,20-Jun-2020,02:30 AM,03:00 AM,30 min,,,https://www.eventbrite.com/e/webinar-taking-ca...,https://assets.productschool.com/wp-content/up...
...,...,...,...,...,...,...,...,...,...
93,Product Management Live Chat by fmr Northweste...,20-Aug-2020,11:00 PM,11:30 PM,30 min,,,https://www.eventbrite.com/e/product-managemen...,https://assets.productschool.com/wp-content/up...
94,Product Management Live Chat by Airbnb PM,25-Aug-2020,11:00 PM,11:30 PM,30 min,,,https://www.eventbrite.com/e/product-managemen...,https://www.productschool.com/wp-content/theme...
95,Product Management Live Chat by Drift Director...,27-Aug-2020,11:00 PM,11:30 PM,30 min,,,https://www.eventbrite.com/e/product-managemen...,https://assets.productschool.com/wp-content/up...
96,Product Management Live Chat by Google Product...,01-Sep-2020,11:00 PM,11:30 PM,30 min,,,https://www.eventbrite.com/e/product-managemen...,https://assets.productschool.com/wp-content/up...
