## Data Preparation
1. Deleting talks with missing values
2. Cleaning up transcripts
3. Data Exploration


In [None]:
import pandas as pd
import json

import numpy as np
import matplotlib.pyplot as plt

import pickle 
import re

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
sns.set_style("darkgrid")
from copy import deepcopy

%pylab inline

%matplotlib inline

In [None]:
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 3)

In [None]:
df = pd.read_csv('../Data/TED_Talks_FullSet.csv', encoding='latin-1')

In [None]:
df

In [None]:
listCol = list(df.columns)

In [None]:
listCol

#### Storing only columns that are information fields from TED

In [None]:
Data[(Data['year_filmed']== 17.72)]

In [None]:
Data[(Data['year_filmed']==  15.1)]

In [None]:
Data[(Data['year_filmed']==  17.75)]

Deleting the above rows of data

In [None]:
# Keep every row whose year_filmed is not one of the four non-year values
# inspected in the cells above (rows with a missing year_filmed are kept too).
Data = Data[~Data['year_filmed'].isin([18.41, 17.72, 15.1, 17.75])]


#### Checking if rows were deleted

In [None]:
Data.year_filmed.unique()

In [None]:
Data[Data['year_filmed']==  1984.]

### Reset Index

In [None]:
Data = Data.reset_index(drop=True)

## Data Exploration

### No. of Talks

In [None]:
Data[Data['headline']=='WhatÛªs wrong with your pa$$w0rd?']

In [None]:
Data = Data.replace('WhatÛªs wrong with your pa$$w0rd?', 'What\'s wrong with your password?')

In [None]:
Data[Data['headline']=='What\'s wrong with your password?']

In [None]:
import pickle 

In [None]:
with open('../Data/DataDF.pkl', 'wb') as picklefile:
    pickle.dump(Data, picklefile)

In [None]:
with open('../Data/DataDF.pkl', 'rb') as picklefile5:
    Data2 = pickle.load(picklefile5)


In [None]:
Data2

# Popularity Exploration

## Sorting the talks by the no. of views

#### Sorting by No. of Views (High to low)

In [None]:
Data.sort_values(by=['views_as_of_06162017'],ascending=False)

### Top 25

In [None]:
Data.groupby('year_filmed').id.count()

### Date/Year Published Analysis

In [None]:
import datetime as dt

In [None]:
Data['date_published'] = pd.to_datetime(Data['date_published'])

In [None]:
#check change
Data.date_published.dtypes

In [None]:
# Create columns holding the year and the month of publication, taken from
# the datetime accessor of the date_published column.
Data['Pub_Year'] = Data['date_published'].dt.year
Data['Pub_Month'] = Data['date_published'].dt.month

In [None]:
Data.head()

### No. of talks published by Year

In [None]:
Data.groupby('Pub_Year').id.count()

## Removing time stamps (e.g. 5:30), \r, \n, (Laughter) and (Applause)

In [None]:
Data['transcript'].head(2)

Use Rubular to test the regular expression: http://www.rubular.com/r/JsdNM3nFJ3


# Checks

In [None]:
with open('../Data/DataDF.pkl', 'rb') as picklefile5:
    Data2 = pickle.load(picklefile5)

### Check DF

In [None]:
Data2.head(2)

In [None]:
Data2.tail()

In [None]:
Data2.shape

### Check Speaker Details

In [None]:
Data2[Data2.speaker == "Amy Cuddy"]

### Check Speech Details with Index No.

In [None]:
Data2.iloc[1260,:]

In [None]:
Data2.iloc[1260:1261,:]   #Above in DF format

### Check Transcript with Index No.

In [None]:
top25 = Data.sort_values(by=['views_as_of_06162017'],ascending=False).head(25)

In [None]:
top25

In [None]:
with open('../Data/Top25.pkl', 'wb') as picklefile2:
    pickle.dump(top25, picklefile2)

#### Bottom25

In [None]:
# Build the working frame from df, keeping only the columns listed here.
Data = df.filter(items=[
    'id', 'speaker', 'headline', 'URL', 'description', 'transcript_URL',
    'month_filmed', 'year_filmed', 'event', 'duration', 'date_published',
    'views_as_of_06162017', 'tags', 'transcript',
])

In [None]:
Data

In [None]:
Data.dtypes

### Clean up Missing Transcripts

In [None]:
bottom25 = Data.sort_values(by=['views_as_of_06162017'],ascending=False).tail(25)
bottom25

In [None]:
with open('../Data/Bottom25.pkl', 'wb') as picklefile3:
    pickle.dump(bottom25, picklefile3)

## Top 5 Most Viewed for each Year

In [None]:
# Count the rows where transcript is null (this statement only counts them;
# it does not remove anything).
len(Data[(Data['transcript'].isnull())] ) 

In [None]:
mask = Data['transcript'].isnull()

In [None]:
Data = Data[~mask]

In [None]:
Data.columns

In [None]:
len(Data[(Data['transcript'].isnull())] ) 

In [None]:
Data['tags'].head(5)

In [None]:
Data['headline'].head(5)

#### Cleaning Up more rows

#### No. of Years

In [None]:
Data.year_filmed.nunique()

#### No. of Talks by Actual Year of Talk 

In [None]:
Data['transcript'].head(5)

#### Correcting A Headline

In [None]:
Data.sort_values(by=['Pub_Year','views_as_of_06162017'],ascending=False).groupby('Pub_Year').head(5)

In [None]:
YearlyTop5 = Data.sort_values(by=['Pub_Year','views_as_of_06162017'],ascending=False).groupby('Pub_Year').head(5)

In [None]:
with open('../Data/YearlyTop5.pkl', 'wb') as picklefile4:
    pickle.dump(YearlyTop5, picklefile4)

In [None]:
Data[(Data['year_filmed']==18.41)]

In [None]:
Data.shape

### No. of unique speakers

In [None]:
Data.speaker.nunique()

#### No. of events

In [None]:
Data.event.nunique()

In [None]:
text =Data['transcript'].head(1)

In [None]:
text[0] 

r'(\d{1,2}:\d{2})' Explanation of the pattern used to remove time stamps (e.g. 2:41 or 15:20):  
( ) - capturing group wrapped around the whole pattern; the elements inside must appear in exactly this order  
\d{1,2} - one or two digits  
: - a literal colon following those digits  
\d{2} - exactly two digits after the colon  

In [None]:
# Matches transcript time stamps such as 2:41 or 15:20. Compiled once here
# because StripTranscript is mapped over every transcript.
_TIMESTAMP_RE = re.compile(r'\d{1,2}:\d{2}')


def StripTranscript(x):
    """Return transcript text with time stamps and audience cues removed.

    The following is done to the input string, in order:

    1. Every ``m:ss`` / ``mm:ss`` time stamp is deleted.
    2. The literal markers ``(Laughter)`` and ``(Applause)`` are each
       replaced by a single space.
    3. Every run of whitespace (spaces, carriage returns, newlines, tabs)
       is collapsed to one space, and leading/trailing whitespace is dropped.

    Args:
        x: The raw transcript as a ``str``.

    Returns:
        The cleaned transcript as a single-line ``str``.
    """
    cleaned = _TIMESTAMP_RE.sub('', x)

    # Substitute a space rather than nothing so the words on either side of
    # a cue are not glued together.
    for cue in ('(Laughter)', '(Applause)'):
        cleaned = cleaned.replace(cue, ' ')

    # split() with no argument splits on any whitespace run, so '\r' and
    # '\n' never survive as tokens. The previous per-token '\r' loop could
    # therefore never fire (and indexed one past the end of the list), so
    # it has been removed without changing the result.
    return ' '.join(cleaned.split())


In [None]:
StripTranscript(text[0])

In [None]:
StripTranscript(text[0][200:300])

In [None]:
Data['transcript'] = Data['transcript'].map(StripTranscript)