In [133]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os as os

#ML Regression, Decision Trees
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

#ML PCA
from scipy.cluster import hierarchy
import seaborn as sns
from sklearn import decomposition, preprocessing, cluster, tree
import pydotplus
from yellowbrick.cluster.silhouette import SilhouetteVisualizer

# Preprocessing Data

&nbsp; With a more robust dataset obtained in Step 2, we will proceed to clean and generate two base DataFrame to be used in the following steps.

In [144]:
def preprocessing(df):  

    #drop duplicated columns
    drop_column=['description2', 'Unnamed: 0', 'link', 'author_y', 'title','url', 'title_1']
    df=df.drop(drop_column,axis=1)
    
    #renaming
    column_names=['author', 'date_recorded', 'views', 'likes', 'title',
       'description_1', 'duration_seg', 'date_released', 'keywords','description_2']
    df.columns=column_names

    #mm
    #MODIFYING COLUMN: date_recorded
    #
    df['date_recorded']= pd.to_datetime(df['date_recorded'], format='%B %Y')
  
    ##separate data into new Column
    list_months=[]
    list_years=[]
    for i in range(df.shape[0]):
        list_months.append(df['date_recorded'][i].month)
        list_years.append(df['date_recorded'][i].year)
    df['date_recorded_year']=list_years
    df['date_recorded_month']=list_months

    #
    #MODIFYING COLUMN: date_released
    #
    column='date_released'
    df[column]= pd.to_datetime(df[column], format='%Y-%m-%d %H:%M:%S')

    ##separate data into new Column
    list_months=[]
    list_years=[]
    list_hours=[]
    list_minutes=[]
    for i in range(df.shape[0]):
        list_months.append(df[column][i].month)
        list_years.append(df[column][i].year)
        list_hours.append(df[column][i].hour)
        list_minutes.append(df[column][i].minute)
    df[column+'_year']=list_years
    df[column+'_month']=list_months
    df[column+'_hour']=list_hours
    df[column+'_minute']=list_minutes

    #
    #MODIFYING COLUMN: 'keywords'
    #
    df_key=df.keywords
    i=0
    df_result=pd.DataFrame()
    ##transforming line into string
    for line in df_key:
        line=(str(line).replace("[","").replace("]","").split(','))
        new_line=[]
        ##removing additional spaces in words and converting the into lower case
        for word in line:
            word=word.lower().replace(' ', '')[1:-1]
            new_line.append(word)
        ##transforming line into string
        new_line=str(new_line).replace("[","").replace("]","")
        ##writting line into dataframe
        df_result.at[i,'keywords2']=new_line
        i=i+1
  
    df=pd.concat([df,df_result], axis=1)

    #drop initial columns
    drop_columns=['date_recorded','date_released', 'keywords']
    df=df.drop(drop_columns, axis=1)
    
    return(df)

In [135]:
def create_dummies_file(df):
    '''
    This function does: 
    1) convert df.keywords into dummy columns
    2) creates a file called ''in order to manually map new categories from keywords
    '''
    #converting keywords into dummy columns
    df2=df.keywords2.str.get_dummies(',')

    #joining with df
    df=pd.concat([df,df2], axis=1)
    
    #removing 'ted' column
    column_to_drop=df.columns[362]
    df2=df.drop(column_to_drop, axis=1)

    #counting dummies and creating file to rename categories
    dummy_columns=pd.Series(np.arange(15,349,1))[1:]
    df_dummies=df.iloc[:,dummy_columns].sum().reset_index()
    df_dummies.columns=['keyword', 'sum']
    df2=df_dummies.copy()
    (df2
     .groupby(['keyword'])
     .agg({'sum':'sum'})
    )
    df2=df2.sort_values(by='sum', ascending=False)
    cwd=os.getcwd()
    df2.to_csv(cwd+'/keywords.csv')
    return df

In [136]:
def dummy_data(df):
    '''
    This function takes the keyword_categories.csv file and creates a new dataframe 'df_dummies' to analyze the keywords
    '''
    # cwd=os.getcwd()
    # categories=pd.read_csv(cwd+'/keywords_categories.csv')
    #for github
    categories=pd.read_csv('https://github.com/aaas24/code_library/raw/main/ted_talks/2_preprocessing/keywords_categories.csv')

    #transforming categories
    new_cat=(categories.columns.values.tolist())
    dic={key: None for key in new_cat}

    ##creating dictionary with categories file
    for column in range (0,categories.shape[1]):
        dic_values=[]
        key=new_cat[column]
        for row in range (0,categories.shape[0]):
            value=categories.iloc[row,column]
            if value is np.nan:
                pass
            else:
                value=value.replace(' ', '')[1:-1]
                dic_values.append(value)
        dic.update({key:dic_values})

    ##adding column to df with 
    dummy_columns=pd.concat([df.iloc[:,16:349], df[['likes', 'views']]], axis=1)
    df_dummies=dummy_columns.iloc[:,:-2].sum().reset_index()
    df_dummies.columns=['sub_category', 'num_talks']
    
    #adding categories to subcategories
    list_categories=[]
    for i in range (0, len(set(df_dummies['sub_category']))):
        keyword=df_dummies['sub_category'][i][2:-1]
        ###find category of keyword in dictionary
        for key, value_list in dic.items():
            for x in value_list:
                if keyword==x:
                    category=key
        ###add category to list
        list_categories.append(category)
    
    ##add list_categories to df
    df_dummies['category']=list_categories

    #add num likes and views
    list_likes=[]
    list_views=[]
    for row in range (0,df_dummies.shape[0]):
        subcategory=df_dummies.iloc[row,0]
        df2=dummy_columns[[subcategory,'likes','views']]
        df2.columns=['A', 'likes','views']
        df3=(df2
             .query('A>0')
             .groupby('A')
             .agg({'likes': ['sum'], 'views':['sum']})
            )
        
        list_likes.append(df3.iloc[0,0])
        list_views.append(df3.iloc[0,1])
    #add lists to df_dummies
    df_dummies['likes']=list_likes
    df_dummies['views']=list_views
    return df_dummies

In [137]:
def main():
    #load data
    raw_data=pd.read_csv('https://github.com/aaas24/code_library/raw/main/ted_talks/1_raw_data/final_raw_data.csv')
    df=raw_data.copy()

    #clean data
    df=preprocessing(df)
    df=create_dummies_file(df)
    df_dummies=dummy_data(df)
    return(df, df_dummies)
  


In [138]:
if __name__ == '__main__':
  main()

In [139]:
df=main()[0]
df.head(5)

Unnamed: 0,author,views,likes,title,description_1,duration_seg,description_2,date_recorded_year,date_recorded_month,date_released_year,...,'water','weather','windenergy','women','womeninbusiness','work','work-lifebalance','writing','youth','ted'
0,Ozawa Bineshi Albert,404000,12000,Climate action needs new frontline leadership,"""We can't rely on those who created climate ch...",834,"""We can't rely on those who created climate ch...",2021,12,2022,...,0,0,0,0,0,0,0,0,0,1
1,Sydney Iaukea,214000,6400,The dark history of the overthrow of Hawaii,"""On January 16th, 1895, two men arrived at Lil...",0,"""On January 16th, 1895, two men arrived at Lil...",2022,2,2022,...,0,0,0,1,0,0,0,0,0,1
2,Martin Reeves,412000,12000,Why play is essential for business,"""To thrive in today's competitive economy, you...",665,"""To thrive in today's competitive economy, you...",2021,9,2022,...,0,0,0,0,0,1,0,0,0,1
3,James K. Thornton,427000,12000,Why is China appointing judges to combat clima...,"""Why is China appointing thousands of judges t...",695,"""Why is China appointing thousands of judges t...",2021,10,2022,...,0,0,0,0,0,0,0,0,0,1
4,Mahendra Singhi,2400,72,Cement's carbon problem -- and 2 ways to fix it,"""Cement is vital to modernizing all kinds of i...",671,"""Cement is vital to modernizing all kinds of i...",2021,10,2022,...,0,0,0,0,0,0,0,0,0,1


In [140]:
df_dummies=main()[1]
df_dummies.head(5)

Unnamed: 0,sub_category,num_talks,category,likes,views
0,'3dprinting',9,technology,201574,6655100
1,'activism',352,values & emotions,21752759,714057797
2,'addiction',20,health,1870500,60982000
3,'africa',197,global,9097799,299541000
4,'aging',93,society,8152092,269034199


# Data Exploration

## PCA - Visualize High Dimension Data

In [141]:
df.head(3)

Unnamed: 0,author,views,likes,title,description_1,duration_seg,description_2,date_recorded_year,date_recorded_month,date_released_year,...,'water','weather','windenergy','women','womeninbusiness','work','work-lifebalance','writing','youth','ted'
0,Ozawa Bineshi Albert,404000,12000,Climate action needs new frontline leadership,"""We can't rely on those who created climate ch...",834,"""We can't rely on those who created climate ch...",2021,12,2022,...,0,0,0,0,0,0,0,0,0,1
1,Sydney Iaukea,214000,6400,The dark history of the overthrow of Hawaii,"""On January 16th, 1895, two men arrived at Lil...",0,"""On January 16th, 1895, two men arrived at Lil...",2022,2,2022,...,0,0,0,1,0,0,0,0,0,1
2,Martin Reeves,412000,12000,Why play is essential for business,"""To thrive in today's competitive economy, you...",665,"""To thrive in today's competitive economy, you...",2021,9,2022,...,0,0,0,0,0,1,0,0,0,1


In [142]:
#dropping categorical columns
df_model=df.drop(['author','title','description_1', 'description_2', 'keywords2'], axis=1).iloc[:,:9]
df_model.head(3)

Unnamed: 0,views,likes,duration_seg,date_recorded_year,date_recorded_month,date_released_year,date_released_month,date_released_hour,date_released_minute
0,404000,12000,834,2021,12,2022,2,9,41
1,214000,6400,0,2022,2,2022,2,10,13
2,412000,12000,665,2021,9,2022,2,9,51


In [143]:
X = df_model
std = preprocessing.StandardScaler()
X_std = pd.DataFrame(std.fit_transform(X), columns=X.columns)
X_std

AttributeError: 'function' object has no attribute 'StandardScaler'

In [None]:
pca = decomposition.PCA()
pca_X = pd.DataFrame(pca.fit_transform(X_std), columns=[f'PC{i+1}' for i in range(len(X.columns))])
pca_X

In [None]:
#variance or relevance of PCAs. In this case the first 3 hold ~50% of representation of the data
pca.explained_variance_ratio_


In [None]:
# Components
# First component is .36 * Views + .36 * likes + 0.19 * Dur ... etc
pca.components_[0]

In [None]:
# What columns make up the components 1 & 2?
# 1 - Views & Likes
# 2 - Recorded & Released Year
(pd.DataFrame(pca.components_, columns=X.columns)
 .iloc[:2]
 .plot.bar()
 .legend(bbox_to_anchor=(1,1)))

In [None]:
# What columns make up the components 3 & 4?
(pd.DataFrame(pca.components_, columns=X.columns)
 .iloc[2:4]
 .plot.bar()
 .legend(bbox_to_anchor=(1,1)))

In [None]:
# Plot with Seaborn
x='PC1'
y='PC2'
val='date_released_month'
sns.scatterplot(x=x, y=y, 
                data=pca_X.assign(val=X[val]), 
                hue='val')

## Clustering

In [None]:
inerts = []
for i in range(2, 20):
    k = cluster.KMeans(n_clusters=i, random_state=42)
    k.fit(X_std)
    inerts.append(k.inertia_)
    
pd.Series(inerts).plot()

In [None]:
start, end = 2, 10
cols = 2
rows = ((end - start) // cols)
fix, axes = plt.subplots(rows, cols, figsize=(12,8))
axes = axes.reshape(cols * rows)
for i, k in enumerate(range(start, end), 0):
    ax = axes[i]
    sil = SilhouetteVisualizer(cluster.KMeans(n_clusters=k, random_state=42), ax=ax)
    sil.fit(X_std)
    sil.finalize()
plt.tight_layout()

In [None]:
# Try another mechanism
fig, ax = plt.subplots(figsize=(10,8))
hierarchy.dendrogram(hierarchy.linkage(X_std, method='ward'),
                    truncate_mode='lastp', p=20, show_contracted=True)
pass  # here to hide return value of above

In [None]:
# going to choose 4 clusters
k9 = cluster.KMeans(n_clusters=4, random_state=42)
k9.fit(X_std)
labels = k9.predict(X_std)

In [None]:
labels

In [None]:
X.assign(label=labels)

In [None]:
(X.assign(label=labels)
  .groupby('label')
  .agg(['mean', 'var'])
  .T
)

In [None]:
# how many in each cluster?
pd.Series(labels).value_counts().sort_index()

In [None]:
# Add coloring to aid impact to clusters
(X.assign(label=labels)
  .groupby('label')
  .mean()
  .T
 .style.background_gradient(cmap='RdBu', axis=1)
)

### Clusters
* 0 - Newer videos released in fall
* 1 - Newer videos released in earlier in the year
* 2 - Older videos, longer duration in seg
* 3 - Highest views & likes

In [None]:
# describe a column for each label
(X.assign(label=labels)
  .groupby('label')
  .date_recorded_year
  .describe()
)

In [None]:
# describe a label in a cluster
(X.assign(label=labels)
 .query('label == 0')
 .describe()
)

In [None]:
# Plot with Seaborn
cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
fig, ax = plt.subplots(figsize=(10,8))
sns.scatterplot(x='PC1', y='PC2', 
                data=pca_X.assign(label=labels),
                cmap='Pastel',
                hue='label', ax=ax)

In [None]:
from bokeh.io import output_notebook
from bokeh import models, palettes, transform
from bokeh.plotting import figure, show

def bokeh_scatter(
    x,
    y,
    data,
    hue=None,
    label_cols=None,
    size=None,
    legend=None,
    alpha=0.5,
):
    """
    x - x column name to plot
    y - y column name to plot
    data - pandas dataframe
    hue - column name to color by (numeric)
    legend - column name to label by
    label_cols - columns to use in tooltip (None all in dataframe)
    size - size of points in screen space unigs
    alpha - transparency
    """
    output_notebook()
    circle_kwargs = {}
    if legend:
        circle_kwargs["legend"] = legend
    if size:
        circle_kwargs["size"] = size
    if hue:
        color_seq = data[hue]
        mapper = models.LinearColorMapper(
            palette=palettes.viridis(256),
            low=min(color_seq),
            high=max(color_seq),
        )
        circle_kwargs[
            "fill_color"
        ] = transform.transform(hue, mapper)
    ds = models.ColumnDataSource(data)
    if label_cols is None:
        label_cols = data.columns
    tool_tips = sorted(
        [
            (x, "@{}".format(x))
            for x in label_cols
        ],
        key=lambda tup: tup[0],
    )
    hover = models.HoverTool(
        tooltips=tool_tips
    )
    fig = figure(
        tools=[
            hover,
            "pan",
            "zoom_in",
            "zoom_out",
            "reset",
        ],
        toolbar_location="below",
    )

    fig.circle(
        x,
        y,
        source=ds,
        alpha=alpha,
        **circle_kwargs
    )
    show(fig)
    return fig

res = bokeh_scatter("PC1","PC2", 
                    data=pd.concat([pca_X, X], axis=1).assign(label=labels), hue='label', size=10,
                    label_cols=list(X.columns)+['label'],
                   legend='label')

## Exploring

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sns.heatmap(df_graph.corr(), cmap='RdBu', vmin=-1, vmax=1, annot=True, square=True, ax=ax)

## Exploring: date recorded

In [None]:
df_graph=df[df.date_recorded_year>2000]
df_graph.date_recorded_month.hist()

In [None]:
df_graph.date_released_month.hist()

In [None]:
#improving graph

data=df[df.date_recorded_year>2000]
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(13,5), sharex=True)

#plotting first histogram
ax=(data
    .groupby(['date_recorded_month'])
    .likes
    .count()
    .plot(x='date_recorded_month', kind = 'bar',alpha=0.6, ax=ax,) 
)
#plotting second hidtogram
ax=(data
    .groupby(['date_released_month'])
    .likes
    .count()
    .plot(x='date_released_month', kind = 'bar',alpha=0.5, ax=ax, color='#76725e') 
)
#improving labes
ax.set_xticks(ticks=range(0,12,1))  
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Ago', 'Sep', 'Oct', 'Nov', 'Dec'])
ax.set_xlabel('')
ax.set_ylabel('Count Videos ')
#styling grid, leyend and title
plt.title('Monthly Videos Recorded vs Released', ha='center', fontsize='xx-large')
plt.legend(["Recorded", "Released"], loc='upper center',ncol=2, bbox_to_anchor=(0.5, 1.1), borderaxespad=2.6, facecolor="white")
ax.set_facecolor("white")
plt.grid(axis='y', color='black', alpha=.2)

In [None]:
title='Number of Videos Released by Year'
y_label='Num Videos'

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(13,5), sharex=True)
ax=df.date_released_year.hist(alpha=0.6)
plt.plot(ax=ax)
plt.grid(axis='x')
ax.set_facecolor("white")
ax.set_ylabel(y_label)
plt.grid(axis='y', color='black', alpha=.2)
plt.title(title, ha='center', fontsize='xx-large')

In [None]:
title='Number of Videos Recorded per Year'
y_label='Num Videos'

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(13,5), sharex=True)
ax=df.date_recorded_year.hist(alpha=0.6)
plt.plot(ax=ax)
plt.grid(axis='x')
ax.set_facecolor("white")
ax.set_ylabel(y_label)
plt.grid(axis='y', color='black', alpha=.2)
plt.title(title, ha='center', fontsize='xx-large')

In [None]:
# Are there outliers
title='Most Videos were Recorded Between 2012-2020'
ax=df.date_recorded_year.plot.box()
plt.plot(ax=ax)
plt.grid(axis='x')
ax.set_facecolor("white")
plt.grid(axis='y', color='black', alpha=.2)
plt.title(title, ha='center', fontsize='xx-large')

In [None]:
df_graph=df[df.date_recorded_year<2000]


In [None]:
#removing outliyers
df_graph=df[df.date_recorded_year>2000]
sns.lmplot(x='date_recorded_year', y='likes', data=df_graph, x_jitter=1, scatter_kws={'alpha':.2})

## Exploring: duration of videos

In [None]:
# Are there outliers
df.duration_seg.plot.box()

In [None]:
#relationship between duration and likes
df_graph=df
sns.relplot(x='duration_seg', y='likes', data=df_graph, alpha=.1)

In [None]:
#Insight: during pandemic years (2020-2021) 
df_graph=df[df.date_recorded_year==2019]
sns.relplot(x='duration_seg', y='likes', data=df_graph, col='date_recorded_year', col_wrap=2, alpha=.1)

In [None]:
df_graph = pd.pivot_table(data=df, 
               index=['date_recorded_year'], 
               values=['likes','views'], 
               aggfunc={'likes':[np.sum],'views':np.sum}
        )
df_graph.hist()

In [None]:
not_dummy_columns=pd.Series(np.arange(0,14,1))
df_graph=df.iloc[:,not_dummy_columns].reset_index().drop('index', axis=1)
df_graph.corr()

## Exploring: Keywords categories

In [None]:
df_dummies=main()[1]
df_dummies.head(3)

In [None]:
df_graph=(
            df_dummies.
            groupby(['category'])
            .agg({'likes':['sum'],'views':['sum'], 'num_talks':['sum']})
            
)
df_graph.columns=['likes', 'views', 'num_talks']
df_graph=df_graph.sort_values(by=['num_talks'], ascending=False)

#plot
sns.scatterplot(data=df_graph, x="likes", y="views", size="num_talks", legend=True, hue='category', alpha=0.5, sizes=(40, 400)) 
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=10)
sns.axes_style({
    'axes.facecolor': 'white',
    'axes.edgecolor': 'black',
    'axes.grid': False,
    'figure.facecolor': 'white',
     'grid.color': 'white',
     'grid.linestyle': '-',
     'font.sans-serif': 'Arial',
     'grid.color': '#ffffff'

})
sns.set(rc={"figure.figsize":(9 , 9)}) #(width,height)
plt.show()

In [None]:
df_graph=(
            df_dummies.
            groupby(['sub_category'])
            .agg({'likes':['sum'],'views':['sum'], 'num_talks':['sum']})
            
)
df_graph.columns=['likes', 'views', 'num_talks']
df_graph=(df_graph
    .sort_values(by=['num_talks'], ascending=False)
    .iloc[1:20,:]
)

# df_graph
# plot
sns.scatterplot(data=df_graph, x="likes", y="views", size="num_talks", legend=True, hue='sub_category', alpha=0.5, sizes=(40, 400)) 
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=10)
sns.axes_style({
    'axes.facecolor': 'white',
    'axes.edgecolor': 'black',
    'axes.grid': False,
    'figure.facecolor': 'white',
     'grid.color': 'white',
     'grid.linestyle': '-',
     'font.sans-serif': 'Arial',
     'grid.color': '#ffffff'

})
sns.set(rc={"figure.figsize":(9 ,9)}) #(width,height)
plt.show()

In [None]:
dummy_columns=pd.Series(np.arange(15,349,1))[1:]
df_d=pd.concat([df.iloc[:,dummy_columns],df[['likes', 'views', 'date_released_month']]], axis=1)
df_d.head(3)

In [None]:
df_d.shape

# Analysis

In [None]:
# #what constitude a good video based on views?
y=1000
df_graph=df.views.apply(lambda x: round(x/y,0))
df_graph.describe()

### Top 10 Liked Videos

In [None]:
#build data: These are the 75% most liked videos
y_var='likes'
df_grap = (
    (df.groupby(['title','author','date_recorded_year','views'])[y_var].sum().reset_index())
    .sort_values([y_var],ascending=[False])
    ).reset_index()
df_grap=df_grap.drop('index', axis=1)
df_grap = df_grap[df_grap[y_var] > 65000]
df_grap.head(25)

### Top 10 Authors

In [None]:
#Modifiable variables
y_var='views'
x_var1='author'
x_var2='likes'

#build data
df_grap = (
    (df.groupby([x_var1, x_var2])[y_var].sum().reset_index())
    .sort_values([y_var],ascending=[False])
    ).reset_index().head(20)
df_grap=df_grap.drop('index', axis=1)
df_grap

Areas of improvements:
    
    1) More information on the authors. Understanding age, gender and nationality of authors, may answer questions related to diversity of the speakers. This data could be parcially scrapped from Wikipedia as there is a dedicated website that tracks this information. 
    https://en.wikipedia.org/wiki/List_of_TED_speakers

# ML Predicting model for likes based on content labels to answer:
Will certain labels increase the likes on a content

In [None]:
# #what constitude a good video based on likes?
y=1000
df_graph=df.likes.apply(lambda x: round(x/y,0))
df_graph.describe()

In [None]:
#verifying no NAN in data feeding model
df[df.likes.isnull()==True]

In [None]:
#create target

#we define TARGET a well performing video if it is above 75% percentile. So the model should predict if a video will
#perform above 75% percentile
threshold= np.percentile(df_d.likes, 75)

#create target column
df_d['target']=[1 if x>threshold else 0 for x in df.likes]

In [None]:
#drop multicolinearity columns
df_d=df_d.drop(['likes', 'views'], axis=1)

In [None]:
data=df_d.copy()
data.head(3)

In [None]:
#Balance data
data.target.value_counts()

In [None]:
positive_labels = data[data.target==1]
num_positive_labels = positive_labels.shape[0]
num_positive_labels

In [None]:
negative_labels = data[data.target==0].sample(num_positive_labels)
negative_labels.shape

In [None]:
balanced_data =  positive_labels.append(negative_labels)
balanced_data.target.value_counts()

In [None]:
## Splitting data into test splits

In [None]:
y = balanced_data.pop('target')
X = balanced_data

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.3)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size = 0.33)
X_train.head()

## Linear Regression

In [None]:
# fit a model
clf = LogisticRegression(penalty='l2').fit(X_train, y_train)
# predict probabilities
predictions = clf.predict_proba(X_test)[:, 1]

In [None]:
# Feature Importance
feature_importance = abs(clf.coef_[0])
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

featfig = plt.figure(figsize=(10, 15))
featax = featfig.add_subplot(1, 1, 1)
featax.barh(pos, feature_importance[sorted_idx], align='center')
featax.set_yticks(pos)
featax.set_yticklabels(np.array(X.columns)[sorted_idx], fontsize=8)

plt.show()

### Model Evaluation

In [None]:
# Predict probabilities given test data
y_pred = clf.predict_proba(X_test)
pred_reg=y_pred

In [None]:
# calculate scores
auc = roc_auc_score(y_test, predictions)

# calculate roc curves
fpr, tpr, _ = roc_curve(y_test, predictions)

plt.figure(figsize=(15, 10))
# plot horizontal line 
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, label='ROC curve (AUC = %0.2f)' % auc)
# axis labels
plt.xlabel('FPR')
plt.ylabel('TPR')
# show the legend
plt.legend(loc='lower right')
# show the plot
plt.show()

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(max_depth=10)

print(dt_model)

dt_model = dt_model.fit(X_train,y_train)
pred_dt = dt_model.predict_proba(X_valid)[:, 1]

In [None]:
from sklearn.metrics import classification_report
pred_dt_binary = dt_model.predict(X_valid)
print(classification_report(y_valid, pred_dt_binary))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
print(rf_model)

rf_model = rf_model.fit(X_train, y_train)
pred_rf = rf_model.predict_proba(X_valid)[:, 1]
print(classification_report(y_valid, pred_rf.round(0)))

In [None]:
#Feature Component
from sklearn.inspection import permutation_importance
rf_model.feature_importances_
plt.barh(X.columns.values, rf_model.feature_importances_)

## XGBoost

In [None]:
#code to fix error taken from: https://stackoverflow.com/questions/43579180/feature-names-must-be-unique-xgboost
X_train = X_train.loc[:,~X_train.columns.duplicated()]
X_valid = X_valid.loc[:,~X_valid.columns.duplicated()]

In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier()

xgb_model = xgb_model.fit(X_train, y_train)
pred_xgb = xgb_model.predict_proba(X_valid)[:, 1]

In [None]:
#Feature Component
from xgboost import plot_importance
# plot feature importance
plot_importance(xgb_model)
plt.show()

## Comparing Tree Models

In [None]:
def create_roc_plot(name, predictions):
  auc = roc_auc_score(y_valid, predictions).round(2)
  fpr, tpr, _ = roc_curve(y_valid, predictions)

  plt.figure(figsize=(5, 4))
  plt.plot([0, 1], [0, 1], linestyle='--')  # plot horizontal line 
  plt.plot(fpr, tpr, label='{} AUC = {}'.format(name, auc)) # plot the roc curve for the model
  plt.xlabel('FPR')
  plt.ylabel('TPR')
  plt.legend(loc='lower right')  # show the legend
  plt.show() # show the plot
  return None

In [None]:
# create_roc_plot('Regression', pred_reg)
create_roc_plot('Decision Tree', pred_dt)
create_roc_plot('Random Forest', pred_rf)
create_roc_plot('XGBoost', pred_xgb)

In [None]:
# NLP Machine Learning on Description
#Spacy to capture entities from description

In [None]:
import spacy
from spacy import displacy


text=df.description_1[0]

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
displacy.render(doc, style='ent', jupyter=True)


list_text = []
list_ent = []

for ent in doc.ents:
    print(ent.text, ent.label_)
    list_text.append(ent.text)
    list_ent.append(ent.label_)
    

test_df = pd.DataFrame(list_text, columns=['text'])
test_df['ent'] = list_ent
test_df

print(text)
print(test_df)