# EDA

## Plan:

- Histograms and correlations for continuous variables
- Value counts for categorical variables
- Groupby on categorical variables?
- Pivot tables to see correlations with Evergreen or not

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the dataset from data_cleaned.csv (path is relative to the working directory).
df=pd.read_csv("data_cleaned.csv")

In [None]:
# Lift the limit on displayed columns so wide frames are not truncated,
# then preview the first rows.
pd.set_option("display.max_columns", None)
df.head()

In [None]:
# List the column names available in the dataset.
df.columns

In [None]:
# Split the frame into two views for the EDA: the categorical columns (plus the
# label) and the continuous columns.
categorical_columns = [
    'alchemy_category', 'hasDomainLink', 'is_news', 'lengthyLinkDomain',
    'news_front_page', 'url_simp', 'label',
]
continuous_columns = [
    'alchemy_category_score', 'avglinksize', 'compression_ratio',
    'frameTagRatio', 'html_ratio', 'image_ratio', 'linkwordscore',
    'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
    'parametrizedLinkRatio', 'spelling_errors_ratio', 'commonlinkratio',
]

df_cat = df[categorical_columns]
df_cont = df[continuous_columns]

In [None]:
# Draw one histogram per continuous column, titled with the column name.
for column_name, column_values in df_cont.items():
    plt.hist(column_values)
    plt.title(column_name)
    plt.show()

Perhaps we should transform the variables whose distributions are not normal so that they are closer to normal.

In [None]:
# Pairwise Pearson correlation matrix of the continuous columns.
corr = df_cont.corr(method="pearson")
corr

In [None]:
# Annotated heatmap of the correlation matrix. The colour scale is centred on 0
# and capped at +/-0.7.
plt.figure(figsize=(24, 10))
sns.set(font_scale=1.5)
tick_labels = corr.columns.values
sns.heatmap(
    corr,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    cmap='coolwarm',
    vmin=-0.7,
    vmax=0.7,
    center=0,
    annot=True,
)

In [None]:
# Creating a pivot table to compare the continuous variables across the label (Evergreen or not)

In [None]:
# Continuous columns together with the label, used for the comparison by label.
training_columns = [
    'alchemy_category_score', 'avglinksize', 'compression_ratio',
    'frameTagRatio', 'html_ratio', 'image_ratio', 'linkwordscore',
    'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
    'parametrizedLinkRatio', 'spelling_errors_ratio', 'commonlinkratio',
    'label',
]
training = df[training_columns]

In [None]:
# Compare each continuous variable across the two label values (Evergreen or
# not). pivot_table aggregates with the mean by default.
compared_columns = [
    'alchemy_category_score', 'avglinksize', 'compression_ratio',
    'frameTagRatio', 'html_ratio', 'image_ratio', 'linkwordscore',
    'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
    'parametrizedLinkRatio', 'spelling_errors_ratio', 'commonlinkratio',
]
training.pivot_table(index="label", values=compared_columns)

I don't see a particularly large correlation between any of the variables and the label (whether the website is Evergreen or not). However, the few that stand out are:
- compression_ratio (higher in non-evergreen)
- image_ratio (higher in non-evergreen)
- linkwordscore (higher in non-evergreen)
- non_markup_alphanum_characters (higher in evergreen)
- numberOfLinks (higher in evergreen)

## Value Count Charts for Categorical Variables

In [None]:
# One count plot per categorical column; x tick labels are rotated to vertical
# so long category names stay readable.
for column_name in df_cat.columns:
    sns.countplot(x=column_name, data=df_cat)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Replace the "?" placeholder in alchemy_category with "other", then drop the
# rows whose category is "unknown".
# regex=False is required: "?" is a regex quantifier, so it must be matched as
# a literal character.
df["alchemy_category"] = df["alchemy_category"].str.replace("?", "other", regex=False)

# .copy() makes the filtered frame independent of the original, so column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[df["alchemy_category"] != "unknown"].copy()

In [None]:
# Column dtypes and non-null counts after the alchemy_category cleanup.
df.info()

In [None]:
# "?" marks an unknown news_front_page value; replace it with "0", the mode of
# the column. The values stay strings.
df["news_front_page"] = df["news_front_page"].map(lambda raw: raw.replace("?", "0"))

In [None]:
# Rebuild df_cat, df_cont and training so that they reflect the cleaning steps
# applied to df since they were first created.
continuous_columns = [
    'alchemy_category_score', 'avglinksize', 'compression_ratio',
    'frameTagRatio', 'html_ratio', 'image_ratio', 'linkwordscore',
    'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
    'parametrizedLinkRatio', 'spelling_errors_ratio', 'commonlinkratio',
]

df_cat = df[['alchemy_category', 'hasDomainLink', 'is_news', 'lengthyLinkDomain',
             'news_front_page', 'url_simp', 'label']]
df_cont = df[continuous_columns]
# training is the continuous columns plus the label.
training = df[continuous_columns + ['label']]

In [None]:
# Let's look more specifically at the categorical columns and their relationship with whether the website is evergreen
# or not.
# Let's do this by grouping by each column and taking the mean of the label.

In [None]:
# Mean label per alchemy_category. Select "label" before aggregating: calling
# .mean() on the whole grouped frame also tries to average the string columns,
# which raises TypeError on pandas >= 2.0 and wastes work on older versions.
df.groupby("alchemy_category")["label"].mean()

In [None]:
# Bar plots of the mean label for each level of every categorical variable
# (seaborn's barplot aggregates y with the mean by default).
for column_name in df_cat.columns:
    # Plotting the label against itself carries no information, so skip it.
    if column_name == "label":
        continue
    # Create the figure inside the loop: plt.show() finishes the current
    # figure, so a single figure created before the loop would only size the
    # first plot.
    plt.figure(figsize=(16, 8))
    sns.barplot(data=df_cat, x=column_name, y="label")
    plt.xticks(rotation=45)
    plt.show()


## Feature Engineering

In [None]:
# Columns available at the start of feature engineering.
df.columns

In [None]:
# First let's turn the "?" placeholders into real missing values (NaN) so that
# they can be handled as nulls.
# np.nan is used instead of np.NaN because that alias was removed in NumPy 2.0.
df = df.replace('?', np.nan)

# The "other" value set in alchemy_category stood in for "?", so mark it as
# missing as well.
df["alchemy_category"] = df["alchemy_category"].replace('other', np.nan)

In [None]:
# Per-column count of missing values. This expression is not the last statement
# of the cell, so the notebook does not display its result.
df.isna().sum()

# Drop missing data.
# NOTE(review): axis="columns" (same as axis=1) removes every *column* that
# contains at least one NaN rather than the affected rows; confirm that
# dropping whole columns is what is intended here.
df = df.dropna(axis="columns")

In [None]:
# Re-count missing values per column to verify the result of the dropna step.
df.isnull().sum()

In [None]:
# url_simp has too many distinct values to use directly as a category, so it
# needs simplifying. Start by looking at the 30 most frequent values
# (value_counts sorts by frequency, most common first).

df["url_simp"].value_counts().head(30)

In [None]:
# Number of distinct url_simp values.
df["url_simp"].nunique()

In [None]:
# To make url_simp usable by the model, we replace each URL with the number of times it appears in the data.
# Then let's create categories based on the count of each URL.

In [None]:
# TODO: unfinished loop, left commented out. As written it is an incomplete
# statement (SyntaxError) and stops the notebook from running top to bottom.
# for i in df

In [None]:
# Number of rows per url_simp value. value_counts() already groups by distinct
# value; calling .unique() first returns a NumPy array, which has no
# .value_counts() method and raises AttributeError.
df["url_simp"].value_counts()

In [None]:
# For each url_simp value, count the non-null entries in every other column.
df_groupby=df.groupby("url_simp").count()

In [None]:
# Move url_simp from the index back into a regular column. The guard keeps the
# cell safe to re-run: resetting an already-reset frame would add a spurious
# "index" column.
if "url_simp" not in df_groupby.columns:
    df_groupby = df_groupby.reset_index()

In [None]:
# Preview the per-url_simp counts.
df_groupby.head()

In [None]:
# Column dtypes and non-null counts of the per-url_simp table.
df_groupby.info()

In [None]:
def count_url(count=None):
    """Bucket a URL occurrence count into a letter category from "A" to "H".

    The thresholds are strict lower bounds, checked from largest to smallest:
    >90 "A", >50 "B", >30 "C", >15 "D", >10 "E", >5 "F", >2 "G", and "H" for
    everything else.

    Parameters
    ----------
    count : number, optional
        A single occurrence count to categorise. When omitted, the function is
        applied to every value of the notebook-level ``df_groupby["url"]``
        column, which keeps the original zero-argument call working.

    Returns
    -------
    str or pandas.Series
        The category letter for ``count``, or a Series of letters (one per row
        of ``df_groupby``) when called without an argument.
    """
    if count is None:
        # Zero-argument form: categorise each row rather than testing the whole
        # Series in an ``if``, which raises "truth value is ambiguous".
        return df_groupby["url"].apply(count_url)

    thresholds = (
        (90, "A"),
        (50, "B"),
        (30, "C"),
        (15, "D"),
        (10, "E"),
        (5, "F"),
        (2, "G"),
    )
    for lower_bound, letter in thresholds:
        if count > lower_bound:
            return letter
    return "H"

In [None]:
#df["count_url"]=df_groupby["url_simp"].apply(lambda x:counturl(x))

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
#df['url_category']

In [None]:
# Frame of url_simp value counts. Its column names depend on the pandas
# version, so it is not used to build the lookup below.
url_count_df = pd.DataFrame(df['url_simp'].value_counts()).reset_index()

# Map each url_simp value to the number of rows in which it appears. Building
# the dict straight from the value_counts Series avoids relying on an 'index'
# column, which no longer exists after reset_index() on pandas >= 2.0 and
# raised KeyError there.
url_count_dict = df['url_simp'].value_counts().to_dict()
url_count_dict

In [None]:
# Look up every row's url_simp in url_count_dict with a vectorised map instead
# of a Python loop over .iloc. url_count is kept as a plain list, as before.
url_count = df['url_simp'].map(url_count_dict).tolist()

# Store the per-row occurrence count as a new column and preview the result.
df['url_count'] = url_count
df.head()

In [None]:
def categorize_url(url_count):
    """Bucket a URL occurrence count into the category "A", "B" or "C".

    Parameters
    ----------
    url_count : number
        Number of times a URL appears.

    Returns
    -------
    str
        "A" when ``url_count`` is greater than 90, "B" when it is greater than
        50 (and at most 90), otherwise "C".
    """
    # Return the letter directly. The previous ``cat=="A"`` was a comparison,
    # not an assignment, so ``cat`` was never defined and the call raised
    # NameError.
    if url_count > 90:
        return "A"
    if url_count > 50:
        return "B"
    return "C"