In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Prompt for a file upload through the Colab `files` helper.
# `uploaded` is indexed by file name in the loading cell to obtain the raw bytes.
from google.colab import files
uploaded = files.upload()

# Load Data

In [None]:
import io

# Expose the uploaded bytes as an in-memory binary stream so pandas can parse them as CSV.
csv_buffer = io.BytesIO(uploaded['OnlineNewsPopularity.csv'])
news_popularity = pd.read_csv(csv_buffer)

First, we look at the first 10 rows to get a feel for the dataset.

In [None]:
# Remove the column display limit so every column appears in the preview.
pd.set_option('display.max_columns', None)
news_popularity.head(10)


Now let's look at the dimensions of the dataset.

In [None]:
# (number of rows, number of columns) of the raw frame.
news_popularity.shape

The dataset has 39644 rows and 60 variables and one target variable.

Now we need to check the quality of the dataset. 

First, we need to know the variable names.

In [None]:
# NOTE(review): pd.read_csv already returns a DataFrame, so this re-wrap looks redundant — confirm before removing.
news_popularity = pd.DataFrame(news_popularity)

In [None]:
# Column labels as a plain Python list.
column_names = news_popularity.columns.tolist()

In [None]:
# Display the list of column names.
column_names

In [None]:
# Build the working frame `df` without the url column, then preview it.
df = news_popularity.drop(columns=["url"])
df.head()

# Understand the Data

In [None]:
# One row per column, showing the dtype pandas inferred for it.
data_types = df.dtypes.to_frame(name='data_type')
data_types

From the above output we can say that all the variables are numerical and according to the need we will convert the binary variables into category.

In [None]:
# Summary statistics of the data
df.describe()

The summary statistics suggest that there might be outliers and implausible values, and that some data types were inferred incorrectly by pandas.


# Check Missing Values




In [None]:
# Report the percentage of missing values for every column.
# NOTE: despite its name, `features_with_na` holds ALL columns, not only those with NaNs;
# the name is kept so any later reference keeps working.
features_with_na = list(df.columns)

for features in features_with_na:
  # isnull().mean() is a fraction in [0, 1]; scale by 100 so the number matches the '%' label.
  missing_pct = np.round(df[features].isnull().mean() * 100, 4)
  print(features, missing_pct, ' % Missing Values')

# Separating Numerical & Categorical Variables 

In [None]:
my_features = list(df.columns)

# Split columns by cardinality: more than 3 distinct values counts as continuous,
# anything else as discrete. Adjust the cut-off depending on your dataset.
distinct_counts = {name: df[name].nunique() for name in my_features}
continuous_features = [name for name in my_features if distinct_counts[name] > 3]
discrete_features = [name for name in my_features if not distinct_counts[name] > 3]

print("Continuous Variables Count: {}, Continuous features: {}".format(len(continuous_features), continuous_features))
print("Discrete Variables Count: {}, Discrete features: {}".format(len(discrete_features), discrete_features))


We have 45 continuous variables and 15 categorical variables. 


In [None]:
# The 14 indicator columns (data channel, weekday, weekend) to store as pandas categoricals.
indicator_columns = [
    ' data_channel_is_lifestyle',
    ' data_channel_is_entertainment',
    ' data_channel_is_bus',
    ' data_channel_is_socmed',
    ' data_channel_is_tech',
    ' data_channel_is_world',
    ' weekday_is_monday',
    ' weekday_is_tuesday',
    ' weekday_is_wednesday',
    ' weekday_is_thursday',
    ' weekday_is_friday',
    ' weekday_is_saturday',
    ' weekday_is_sunday',
    ' is_weekend',
]
for feature in indicator_columns:
    df[feature] = df[feature].astype('category')

# Rebuild the dtype table to confirm the conversion took effect.
data_types = pd.DataFrame(df.dtypes, columns=['data_type'])
data_types

In [None]:
# Column subsets of `df` for the continuous and discrete feature lists.
# (`df_descrete` keeps its original spelling so existing references still resolve.)
df_continuous = df[continuous_features]
df_descrete = df[discrete_features]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw one boxplot per continuous feature, each on its own figure.
for feature in continuous_features:
    fig, ax = plt.subplots()
    sns.boxplot(data=df_continuous, x=feature, ax=ax)

As we can see there are outliers in most of the features. But first we need to check the implausible values in our dataset. 

In [None]:
# List the exact column labels; they are used verbatim (leading space included) in the cells below.
print(df.columns)

# Check Implausible Values


In [None]:
# Plausible (min, max) bounds per feature; None means that side is unbounded.
feature_ranges = {
    " n_tokens_title": (1, None),
    " n_tokens_content": (0, None),
    " n_unique_tokens": (0, 1),
    " n_non_stop_words": (0, 1),
    " n_non_stop_unique_tokens": (0, 1),
    " num_hrefs": (0, None),
    " num_self_hrefs": (0, None),
    " num_imgs": (0, None),
    " num_videos": (0, None),
    " average_token_length": (1, None),
    " num_keywords": (0, None),
    " kw_min_min": (0, None),
    " kw_max_min": (0, None),
    " kw_avg_min": (0, None),
    " kw_min_max": (0, None),
    " kw_max_max": (0, None),
    " kw_avg_max": (0, None),
    " kw_min_avg": (0, None),
    " kw_max_avg": (0, None),
    " kw_avg_avg": (0, None),
    " self_reference_min_shares": (0, None),
    " self_reference_max_shares": (0, None),
    " self_reference_avg_sharess": (0, None),
    " LDA_00": (0, 1),
    " LDA_01": (0, 1),
    " LDA_02": (0, 1),
    " LDA_03": (0, 1),
    " LDA_04": (0, 1),
    " global_subjectivity": (0, 1),
    " global_sentiment_polarity": (-1, 1),
    " global_rate_positive_words": (0, 1),
    " global_rate_negative_words": (0, 1),
    " rate_positive_words": (0, 1),
    " rate_negative_words": (0, 1),
    " avg_positive_polarity": (0, 1),
    " min_positive_polarity": (0, 1),
    " max_positive_polarity": (0, 1),
    " avg_negative_polarity": (-1, 0),
    " min_negative_polarity": (-1, 0),
    " max_negative_polarity": (-1, 0),
    " title_subjectivity": (0, 1),
    " title_sentiment_polarity": (-1, 1),
    " abs_title_subjectivity": (0, 1),
    " abs_title_sentiment_polarity": (0, 1),
    " shares": (0, None),
}

# Name every feature that has at least one value outside its plausible bounds.
for feature, (min_value, max_value) in feature_ranges.items():
    values = df[feature]
    too_low = min_value is not None and (values < min_value).any()
    too_high = max_value is not None and (values > max_value).any()

    if too_low or too_high:
        print(f"{feature} has values outside the given range.")


For rate features like n_unique_tokens, n_non_stop_words, and n_non_stop_unique_tokens, we impute out-of-range values with the minimum or maximum plausible value (0 or 1) as they represent proportions.
For average_token_length, since the minimum length of a word is 1, we impute values below 1 with 1.
For keyword-related features like kw_min_min, kw_avg_min, and kw_min_avg, we impute values below 0 with 0, because the number of shares cannot be negative.
This imputation strategy ensures that the values fall within the plausible ranges for each feature while preserving the original data as much as possible

In [None]:
# Pull out-of-range values back to the nearest plausible bound.
# Each entry is (column, lower bound, upper bound); None leaves that side untouched.
#   - the three token-rate columns must lie between 0 and 1
#   - average_token_length must be >= 1 (shortest possible word length)
#   - the keyword share columns must be >= 0, as shares cannot be negative
plausible_bounds = [
    (' n_unique_tokens', 0, 1),
    (' n_non_stop_words', 0, 1),
    (' n_non_stop_unique_tokens', 0, 1),
    (' average_token_length', 1, None),
    (' kw_min_min', 0, None),
    (' kw_avg_min', 0, None),
    (' kw_min_avg', 0, None),
]

for column, lower, upper in plausible_bounds:
    # clip() replaces values below `lower` with `lower` and values above `upper` with `upper`.
    df[column] = df[column].clip(lower=lower, upper=upper)

In [None]:
# Repeat the plausibility check; no printed output means every feature is now within bounds.
for feature, bounds in feature_ranges.items():
    min_value, max_value = bounds
    column = df[feature]

    violates_min = (column < min_value).any() if min_value is not None else False
    violates_max = (column > max_value).any() if max_value is not None else False

    if violates_min or violates_max:
        print(f"{feature} has values outside the given range.")

No output, that means none of the features have out of range values. 

# Assessing the impact of outliers

In [None]:
# Flag outliers per continuous feature with the 1.5 * IQR rule and print the count,
# followed by the lower limit, the upper limit and the IQR itself.
for column in continuous_features:
    values = df[column]
    q1 = values.quantile(0.25)    # first quartile
    q3 = values.quantile(0.75)    # third quartile
    IQR = q3 - q1

    llimit = q1 - 1.5*IQR
    ulimit = q3 + 1.5*IQR

    is_outlier = (values < llimit) | (values > ulimit)
    outliers = df[is_outlier]
    print(f'Number of outliers in "{column}" : {len(outliers)}')
    for bound in (llimit, ulimit, IQR):
        print(bound)

Some features have a high number of outliers, but those values could still be plausible. Therefore, to reduce their impact, we will scale all the features.

In [None]:
# Summary statistics of the target column.
df[' shares'].describe()

In [None]:
# 95th percentile of shares; rows strictly above it form the top 5% ("viral") group.
threshold = df[' shares'].quantile(0.95)
above_threshold = df[' shares'] > threshold
viral_news = df[above_threshold]

print("Top 5% viral news threshold:", threshold)
print("Number of viral news articles:", len(viral_news))


In [None]:
# 99th percentile of shares; rows strictly above it form the top 1% group.
threshold2 = df[' shares'].quantile(0.99)
above_threshold2 = df[' shares'] > threshold2
viral_news2 = df[above_threshold2]

print("Top 1% viral news threshold:", threshold2)
print("Number of viral news articles:", len(viral_news2))


In [None]:
# Scatterplot of n_tokens_content against shares, coloured by whether the
# article is above the top-5% shares threshold.
fig, ax = plt.subplots(figsize=(10, 6))

palette = {True: "mediumorchid", False: "cornflowerblue"}
is_viral = df[' shares'] > threshold

sns.scatterplot(data=df, x=' n_tokens_content', y=' shares', hue=is_viral, palette=palette, ax=ax)

# Dashed reference line at the threshold itself.
ax.axhline(y=threshold, color='crimson', linestyle='--', label='Top 5% viral news threshold')

ax.legend()
ax.set_title('Scatterplot of n_tokens_content vs shares', fontsize=16)
ax.set_xlabel('n_tokens_content', fontsize=14)
ax.set_ylabel('shares', fontsize=14)

plt.show()



Here we can see that these outliers are plausible: articles with a very high number of tokens tend to be shared less, presumably because people are less inclined to read them.

In [None]:
# Scatterplot of n_non_stop_words against shares, coloured by whether the
# article is above the top-5% shares threshold.
fig, ax = plt.subplots(figsize=(10, 6))

palette = {True: "mediumorchid", False: "cornflowerblue"}
is_viral = df[' shares'] > threshold

sns.scatterplot(data=df, x=' n_non_stop_words', y=' shares', hue=is_viral, palette=palette, ax=ax)

# Dashed reference line at the threshold itself.
ax.axhline(y=threshold, color='crimson', linestyle='--', label='Top 5% viral news threshold')

ax.legend()
ax.set_title('Scatterplot of n_non_stop_words vs shares', fontsize=16)
ax.set_xlabel('n_non_stop_words', fontsize=14)
ax.set_ylabel('shares', fontsize=14)

plt.show()

In [None]:
# Scatterplot of kw_max_max against shares, coloured by whether the
# article is above the top-5% shares threshold.
fig, ax = plt.subplots(figsize=(10, 6))

palette = {True: "mediumorchid", False: "cornflowerblue"}
is_viral = df[' shares'] > threshold

sns.scatterplot(data=df, x=' kw_max_max', y=' shares', hue=is_viral, palette=palette, ax=ax)

# Dashed reference line at the threshold itself.
ax.axhline(y=threshold, color='crimson', linestyle='--', label='Top 5% viral news threshold')

ax.legend()
ax.set_title('Scatterplot of kw_max_max vs shares', fontsize=16)
ax.set_xlabel('kw_max_max', fontsize=14)
ax.set_ylabel('shares', fontsize=14)

plt.show()

In [None]:
# Scatterplot of n_unique_tokens against shares, coloured by whether the
# article is above the top-5% shares threshold.
fig, ax = plt.subplots(figsize=(10, 6))

palette = {True: "mediumorchid", False: "cornflowerblue"}
is_viral = df[' shares'] > threshold

sns.scatterplot(data=df, x=' n_unique_tokens', y=' shares', hue=is_viral, palette=palette, ax=ax)

# Dashed reference line at the threshold itself.
ax.axhline(y=threshold, color='crimson', linestyle='--', label='Top 5% viral news threshold')

ax.legend()
ax.set_title('Scatterplot of n_unique_tokens vs shares', fontsize=16)
ax.set_xlabel('n_unique_tokens', fontsize=14)
ax.set_ylabel('shares', fontsize=14)

plt.show()

In [None]:
# Scatterplot of kw_avg_max against shares, coloured by whether the
# article is above the top-5% shares threshold.
fig, ax = plt.subplots(figsize=(10, 6))

palette = {True: "mediumorchid", False: "cornflowerblue"}
is_viral = df[' shares'] > threshold

sns.scatterplot(data=df, x=' kw_avg_max', y=' shares', hue=is_viral, palette=palette, ax=ax)

# Dashed reference line at the threshold itself.
ax.axhline(y=threshold, color='crimson', linestyle='--', label='Top 5% viral news threshold')

ax.legend()
ax.set_title('Scatterplot of kw_avg_max vs shares', fontsize=16)
ax.set_xlabel('kw_avg_max', fontsize=14)
ax.set_ylabel('shares', fontsize=14)

plt.show()

# Scaling of Dataset

In [None]:
# The continuous and discrete features were separated earlier; display the continuous list.
continuous_features

In [None]:
# Display the discrete feature list.
discrete_features

In [None]:
# continuous_features and discrete_features are only lists of column names,
# so select the matching columns of `df` into DataFrames for the following analysis.
continuous_df = df[continuous_features]
discrete_df = df[discrete_features]

In [None]:
# Preview the continuous feature frame.
continuous_df.head()

In [None]:
# timedelta is only the time difference between data collection and publication,
# so it is not expected to add value to the prediction and is left out.
continuous_df = continuous_df.drop(columns=[' timedelta'])

In [None]:
# The target variable must not be part of the feature frame.
continuous_df = continuous_df.drop(columns=[' shares'])


In [None]:
# Find the columns holding at least one zero or negative value
# (the test is <= 0, so zeros are flagged too).
has_nonpositive = (continuous_df <= 0).any()
negcols = continuous_df.columns[has_nonpositive]
negcols

Almost all of the numerical features contain zero or negative values, so we have to shift them to strictly positive values before the Box-Cox method can be applied to transform the features. (Other methods were tried as well, but the results were not as good.)

In [None]:
# For each flagged column add a '<name>_new' copy shifted so that its minimum lands at 1.
for col in negcols:
    col_min = continuous_df[col].min()
    continuous_df[col + '_new'] = (continuous_df[col] + 1) - col_min

In [None]:
# Show the column labels, now including the shifted '_new' columns.
continuous_df.columns

We got the new positive columns. Now dropping negative cols

In [None]:
# Remove the original (unshifted) columns in a single in-place drop;
# only their '_new' counterparts remain.
continuous_df.drop(columns=list(negcols), inplace=True)


In [None]:
# Re-run the zero-or-negative check; an empty Index means every remaining column is strictly positive.
still_nonpositive = continuous_df.le(0).any()
negcols = continuous_df.columns[still_nonpositive]
negcols

Finally, no column contains zero or negative values any more.

In [None]:
from sklearn import preprocessing

# Box-Cox power transform of the continuous features (no standardisation).
pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)
transformed_values = pt.fit_transform(continuous_df)

# Wrap the *transformed* array in a DataFrame. The previous version rebuilt the
# frame from continuous_df itself, which silently discarded the Box-Cox result.
# Column labels and index are carried over so the later concat with discrete_df aligns.
df_num_add = pd.DataFrame(
    transformed_values,
    columns=continuous_df.columns,
    index=continuous_df.index,
)


In [None]:
# Cap every column at its 1st and 99th percentiles.
# The previous form `df[col][mask] = value` is chained-indexing assignment: it raises
# SettingWithCopyWarning and does not modify the frame under pandas copy-on-write.
# clip() applies the same caps and the result is assigned back explicitly.
for col in df_num_add.columns:
    lower_cap, upper_cap = df_num_add[col].quantile([0.01, 0.99]).values
    df_num_add[col] = df_num_add[col].clip(lower=lower_cap, upper=upper_cap)


In [None]:
# Repeat the 1.5 * IQR outlier report on the capped frame (int64/float64 columns only).
num_cols = df_num_add.select_dtypes(['int64','float64']).columns

for column in num_cols:
    series = df_num_add[column]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)   # first and third quartile
    IQR = q3 - q1

    llimit = q1 - 1.5*IQR
    ulimit = q3 + 1.5*IQR

    outside_fences = (series < llimit) | (series > ulimit)
    outliers = df_num_add[outside_fences]
    print(f'Number of outliers in "{column}" : {len(outliers)}')
    print(llimit)
    print(ulimit)
    print(IQR)


In [None]:
sns.set(style="whitegrid", font_scale=1.2)

# Colour set for the plots; the fourth colour is used for every box.
palette = sns.color_palette("husl")
box_color = palette[3]

# One titled boxplot per column of the capped frame, each on its own figure.
for feature in df_num_add.columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(data=df_num_add, x=feature, color=box_color, width=0.5, ax=ax)

    ax.set_title(f'Boxplot of {feature}', fontsize=16)
    ax.set_xlabel(feature, fontsize=14)

    plt.show()


In [None]:
# Column labels of the continuous feature frame.
df_num_add.columns

In [None]:
# Column labels of the discrete feature frame.
discrete_df.columns

In [None]:
# Place the continuous and discrete columns side by side (axis=1 aligns rows on the index).
my_final_df = pd.concat([df_num_add,discrete_df], axis =1 )

In [None]:
# Preview the working frame, which still holds the shares column.
df.head()

In [None]:
# Summary statistics of shares, used to pick the popularity cut-off.
df[' shares'].describe()

We need a threshold that decides whether a news article is popular. We use the median number of shares per article, i.e. 1400: an article with at least 1400 shares is labelled popular, otherwise not.

In [None]:
# Binary target: 0 when shares are below 1400, 1 otherwise. The raw share count is kept too.
below_cutoff = df[' shares'] < 1400
my_final_df['popularity'] = (~below_cutoff).astype('int64')
my_final_df['shares'] = df[' shares']

In [None]:
# Preview the combined frame with the new popularity and shares columns.
my_final_df.head()

The data was normalized above, so we can now run t-tests to check statistically whether the feature means differ significantly between the top 5% and the bottom 95% of articles by shares.


In [None]:
import numpy as np
from scipy.stats import ttest_ind

# Cut-off for the top 5% of articles by share count.
shares_column = my_final_df['shares']
shares_95th_percentile = np.percentile(shares_column, 95)

# Partition the rows: strictly above the cut-off versus everything else.
top_5_percent_shares = my_final_df[shares_column > shares_95th_percentile]
remaining_shares = my_final_df[shares_column <= shares_95th_percentile]

# Two-sample t-test on n_tokens_title between the two groups.
title_tokens_top = top_5_percent_shares[' n_tokens_title']
title_tokens_rest = remaining_shares[' n_tokens_title']
t_stat, p_value = ttest_ind(title_tokens_top, title_tokens_rest)

print("t-statistic:", t_stat)
print("p-value:", p_value)


In [None]:
# Two-sample t-test on num_imgs_new: top-5% group versus the rest.
imgs_top = top_5_percent_shares[' num_imgs_new']
imgs_rest = remaining_shares[' num_imgs_new']
t_stat, p_value = ttest_ind(imgs_top, imgs_rest)
print("t-statistic:", t_stat)
print("p-value:", p_value)


In [None]:
# Summary statistics of the num_imgs_new column.
my_final_df[' num_imgs_new'].describe()

In [None]:
# Count the rows whose num_imgs_new value exceeds 5.
# NOTE(review): the cut-off 5 is hard-coded and its rationale is not shown here — confirm it suits the column's current scale.
total_outliers_img = my_final_df[my_final_df[' num_imgs_new'] > 5]
print("total outliers in num_imgs_new: ", len(total_outliers_img))

In [None]:
# Two-sample t-test on kw_min_min_new: top-5% group versus the rest.
kw_min_min_top = top_5_percent_shares[' kw_min_min_new']
kw_min_min_rest = remaining_shares[' kw_min_min_new']
t_stat, p_value = ttest_ind(kw_min_min_top, kw_min_min_rest)
print("t-statistic:", t_stat)
print("p-value:", p_value)


In [None]:
# Two-sample t-test on kw_max_min_new: top-5% group versus the rest.
kw_max_min_top = top_5_percent_shares[' kw_max_min_new']
kw_max_min_rest = remaining_shares[' kw_max_min_new']
t_stat, p_value = ttest_ind(kw_max_min_top, kw_max_min_rest)
print("t-statistic:", t_stat)
print("p-value:", p_value)

In [None]:
# Two-sample t-test on title_sentiment_polarity_new: top-5% group versus the rest.
title_polarity_top = top_5_percent_shares[' title_sentiment_polarity_new']
title_polarity_rest = remaining_shares[' title_sentiment_polarity_new']
t_stat, p_value = ttest_ind(title_polarity_top, title_polarity_rest)
print("t-statistic:", t_stat)
print("p-value:", p_value)

Yes, we can say that some of the features have outliers. To reduce their effect, we transform the data, and we will also apply PCA to reduce dimensionality.

In [None]:
# Rename the features so they are easier to understand:
#   1. remove the spaces from the column labels,
#   2. remove the '_new' suffix added when columns were shifted to be positive,
#   3. give the keyword-share columns descriptive worst/best/avg prefixes.
#
# The previous single rename() listed both 'kw_min_min_new' -> 'kw_min_min' and
# 'kw_min_min' -> 'worstkw_min'. rename() applies its mapping once per label, so the
# second step never fired and the worst-keyword columns kept their kw_* names while
# the best/avg ones were renamed. Its keys with a leading space were also dead,
# because the spaces had already been stripped on the line before.
my_final_df.columns = (
    my_final_df.columns
    .str.replace(' ', '')
    .str.replace(r'_new$', '', regex=True)
)

keyword_column_names = {
    'kw_min_min': 'worstkw_min', 'kw_max_min': 'worstkw_max', 'kw_avg_min': 'worstkw_avg',
    'kw_min_max': 'bestkw_min', 'kw_max_max': 'bestkw_max', 'kw_avg_max': 'bestkw_avg',
    'kw_min_avg': 'avgkw_min', 'kw_max_avg': 'avgkw_max', 'kw_avg_avg': 'avgkw_avg',
}
my_final_df.rename(columns=keyword_column_names, inplace=True)

Finally we have changed the variables names. Now data is prepared for further analysis 

In [None]:
# Write the prepared frame to CSV, leaving out the index column.
my_final_df.to_csv('Online_news_popularity_final_cleaned.csv', index=False)

In [None]:
# Hand the exported CSV to the Colab `files` helper for download.
files.download('Online_news_popularity_final_cleaned.csv')