In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

In [None]:
# Load the Online News Popularity CSV from the Kaggle input directory.
csv_path = "/kaggle/input/uci-online-news-popularity-data-set/OnlineNewsPopularity.csv"
data = pd.read_csv(csv_path)

In [None]:
# Display the (rows, columns) dimensions of the freshly loaded frame.
data.shape

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Print the per-column summary (dtypes and non-null counts).
data.info()

# Exploratory Data Analysis

In [None]:
# Descriptive statistics of the frame's columns.
data.describe()

In [None]:
# Remove exact duplicate rows and report how many were dropped.
# The count is printed explicitly so the "no duplicates" conclusion can be
# read off directly instead of comparing shapes by eye. (The explanatory note
# used to be a bare string literal, which Jupyter echoed as the cell output.)
rows_before = len(data)
data = data.drop_duplicates()
print(data.shape)
print("Duplicate rows removed:", rows_before - len(data))

In [None]:
# Count missing values per column and display only the columns that have any.
n = data.isna().sum()
n[n > 0]
# An empty result here means that no column contains missing values.

In [None]:
# One histogram per column, drawn on a single large canvas.
hist_canvas = (20, 20)
data.hist(figsize=hist_canvas)
plt.show()

In [None]:
# Correlation heatmap, lower triangle only (the matrix is symmetric).
# Only numeric columns are correlated: the frame still holds the string
# column `url` at this point, and pandas >= 2.0 raises when corr() is given
# non-numeric data.
cor = data.select_dtypes(include='number').corr()
plt.figure(figsize=(20, 20))
lower_triangle = np.tril(np.ones(cor.shape)).astype(bool)
# Entries above the diagonal become NaN and are left blank by the heatmap.
data1 = cor.where(lower_triangle)
sns.heatmap(data1, cmap='Reds')
plt.show()

In [None]:
# Remove every space character from the column labels.
data.columns = [label.replace(" ", "") for label in data.columns]

In [None]:
# n_tokens_content is the number of words in the article body.
# A value of 0 means the article has no content, so its other attributes
# carry no meaning for the analysis and such rows should be dropped.
# Here we only count how many rows are affected.
is_empty_article = data['n_tokens_content'] == 0
num_of_no_words = data.loc[is_empty_article].index
print("The number of news articles/items without words", num_of_no_words.size)


In [None]:
# Keep only the articles whose content has at least one word.
has_content = data['n_tokens_content'] != 0
data = data[has_content]

In [None]:
# Columns removed before modelling:
#   * 'url'       - non-numeric, adds no value to the analysis
#   * 'timedelta' - non-predictive attribute, not a feature of the data set
#   * "n_non_stop_unique_tokens", "n_non_stop_words", "kw_avg_min"
#                 - dropped as highly correlated attributes
columns_to_remove = [
    'url',
    'timedelta',
    "n_non_stop_unique_tokens",
    "n_non_stop_words",
    "kw_avg_min",
]
data = data.drop(columns=columns_to_remove)

In [None]:
# Distribution summary of the raw target column.
data['shares'].describe()

In [None]:
# a = mean of 'shares', b = median of 'shares'; both are reused as
# popularity thresholds by the plotting cells further down.
a, b = data['shares'].mean(), data['shares'].median()
for caption, value in (("Mean shares: ", a), ("Median shares: ", b)):
    print(caption, value)

In [None]:
# Binary target: 0 when shares < 1400, otherwise 1.
# NOTE(review): 1400 is presumably the median printed above - confirm.
below_cutoff = data['shares'].lt(1400)
data['popularity'] = below_cutoff.map({True: 0, False: 1})
data.hist(column='popularity');

In [None]:
# Scatter 'shares' against the content length and the title length,
# one figure per feature. `palette` is not passed: without a `hue` variable
# seaborn ignores it and only emits a warning.
for feature in ('n_tokens_content', 'n_tokens_title'):
    plt.figure(figsize=(10, 5))
    ax = sns.scatterplot(y='shares', x=feature, data=data)

In [None]:
# Per weekday, count the articles below / at-or-above a share threshold,
# once with the mean (a) and once with the median (b) as the threshold.
# The weekday indicator columns are selected by name prefix instead of by
# position, so the selection does not silently shift when columns are
# added or dropped upstream.
days_of_week = data.columns[data.columns.str.strip().str.startswith('weekday_is_')]
print(days_of_week)

day_positions = np.arange(len(days_of_week))
day_plots = [
    (a, "The mean count of popular/unpopular news over different days of week"),
    (b, "The median Count of popular/unpopular news over different day of week"),
]

for threshold, title in day_plots:
    Unpopular = data[data['shares'] < threshold]
    Popular = data[data['shares'] >= threshold]
    # Summing the 0/1 indicator columns gives the article count per weekday.
    Unpopular_day = Unpopular[days_of_week].sum().values
    Popular_day = Popular[days_of_week].sum().values

    fig = plt.figure(figsize=(13, 5))
    plt.title(title, fontsize=16)

    # The two series are drawn side by side: popular on the tick, unpopular to its left.
    plt.bar(day_positions, Popular_day, width=0.3, align='center', color='tab:orange', label='Popular')
    plt.bar(day_positions - 0.3, Unpopular_day, width=0.3, align='center', color='g', label='Unpopular')

    plt.xticks(day_positions, days_of_week)
    plt.ylabel('COUNT', fontsize=15)
    plt.xlabel('Days of Week', fontsize=17)

    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.show()

In [None]:
# Per data channel, count the articles below / at-or-above a share threshold,
# once with the mean (a) and once with the median (b) as the threshold.
# The channel indicator columns are selected by name prefix instead of by
# position, so the selection does not silently shift when columns are
# added or dropped upstream.
Data_channel = data.columns[data.columns.str.strip().str.startswith('data_channel_is_')]
print(Data_channel)

channel_positions = np.arange(len(Data_channel))
channel_plots = [
    (a, "The mean count of popular/unpopular news over different data channel"),
    (b, "Count of popular/unpopular news over different data channel (Median)"),
]

for threshold, title in channel_plots:
    Unpopular = data[data['shares'] < threshold]
    Popular = data[data['shares'] >= threshold]
    # Summing the 0/1 indicator columns gives the article count per channel.
    Unpopular_day = Unpopular[Data_channel].sum().values
    Popular_day = Popular[Data_channel].sum().values

    fig = plt.figure(figsize=(13, 5))
    plt.title(title, fontsize=16)

    # The two series are drawn side by side: popular on the tick, unpopular to its left.
    plt.bar(channel_positions, Popular_day, width=0.3, align="center", color='tab:orange',
            label="popular")
    plt.bar(channel_positions - 0.3, Unpopular_day, width=0.3, align="center", color='g',
            label="unpopular")

    plt.xticks(channel_positions, Data_channel)
    plt.ylabel("Count", fontsize=12)
    plt.xlabel("Article category", fontsize=12)

    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.show()

In [None]:
# One box plot per column. Each figure is shown and closed inside the loop so
# that dozens of 15x15 figures are not kept open at the same time (matplotlib
# warns once more than `figure.max_open_warning` figures are open).
for column in data.columns:
    fig, ax = plt.subplots(figsize=(15, 15))
    sns.boxplot(data=data, x=column, ax=ax)
    plt.show()
    plt.close(fig)

In [None]:
# Outlier report for every int64/float64 column using the 1.5 * IQR fences.
# For each column: number of rows outside the fences, then the lower fence,
# the upper fence and the IQR.
num_cols = data.select_dtypes(['int64', 'float64']).columns
for column in num_cols:
    values = data[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)   # first / third quartile
    IQR = q3 - q1

    llimit = q1 - 1.5 * IQR
    ulimit = q3 + 1.5 * IQR

    too_low = values < llimit
    too_high = values > ulimit
    outliers = data[too_low | too_high]
    print(f'Number of outliers in "{column}" : {len(outliers)}')
    for reported in (llimit, ulimit, IQR):
        print(reported)

# Scaling

In [None]:
# Split the frame into the binary indicator columns (df_cat) and everything
# else (df_num). 'shares' is also removed from df_num.
indicator_columns = [
    "weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday", "weekday_is_thursday",
    "weekday_is_friday", "weekday_is_saturday", "weekday_is_sunday", "is_weekend",
    "data_channel_is_lifestyle", "data_channel_is_entertainment", "data_channel_is_bus",
    "data_channel_is_socmed", "data_channel_is_tech", "data_channel_is_world",
]

# df_cat: the categorical (indicator) features.
df_cat = data[indicator_columns]

# df_num: the numerical features, without the raw 'shares' column.
df_num = data.drop(columns=indicator_columns).drop(columns='shares')

df_num.columns

In [None]:
# Columns that contain at least one value <= 0 (zero or negative).
has_non_positive = (df_num <= 0).any()
negative_colns = df_num.columns[has_non_positive]
print(negative_colns)

In [None]:
# Box-Cox needs strictly positive input, so every flagged column gets a
# shifted copy named '<column>_new' whose minimum value is 1.
for source_col in negative_colns:
    col_min = df_num[source_col].min()
    df_num[source_col + '_new'] = (df_num[source_col] + 1) - col_min

df_num.columns

In [None]:
# Remove the original columns that were flagged as having values <= 0,
# then re-run the check on what is left.
df_num = df_num.drop(columns=list(negative_colns))

negative_colns = df_num.columns[(df_num <= 0).any()]
print(negative_colns)

In [None]:
#HANDLING THE OUTLIERS

# Step 1: Box-Cox power transform of the numerical features (no standardisation).
pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)
df_num_add = pt.fit_transform(df_num)
# fit_transform returns a bare array. The original row labels are re-attached
# here; without `index=` the frame would get a fresh 0..n-1 index and would no
# longer line up with df_cat / data, whose labels have gaps after row filtering.
df_num_add = pd.DataFrame(df_num_add, columns=df_num.columns, index=df_num.index)

# Step 2: cap every column at its 1st and 99th percentile.
# Series.clip replaces the chained `frame[col][mask] = value` assignment, which
# triggers SettingWithCopyWarning and does not modify the frame at all when
# pandas Copy-on-Write is enabled.
for col in df_num_add.columns:
    percentiles = df_num_add[col].quantile([0.01, 0.99]).values
    df_num_add[col] = df_num_add[col].clip(lower=percentiles[0], upper=percentiles[1])


# Checking for the outliers again
num_cols = df_num_add.select_dtypes(['int64', 'float64']).columns

for column in num_cols:
    q1 = df_num_add[column].quantile(0.25)   # First Quartile (Q1)
    q3 = df_num_add[column].quantile(0.75)   # Third Quartile (Q3)
    IQR = q3 - q1                            # Inter Quartile Range (IQR)

    llimit = q1 - 1.5 * IQR                  # Lower Limit
    ulimit = q3 + 1.5 * IQR                  # Upper Limit

    outliers = df_num_add[(df_num_add[column] < llimit) | (df_num_add[column] > ulimit)]
    print('Number of outliers in "' + column + '" : ' + str(len(outliers)))
    print(llimit)  # Lower limit
    print(ulimit)  # Upper limit
    print(IQR)     # Inter Quartile Range

In [None]:
import matplotlib.pyplot as plt

# Box plot of every int64/float64 column of the transformed frame,
# one figure per column.
num_cols = df_num_add.select_dtypes(['int64', 'float64']).columns

for feature in num_cols:
    sns.boxplot(data=df_num_add, x=feature)
    plt.show()


In [None]:
# Column labels of the transformed numerical frame, then of the indicator frame.
for frame in (df_num_add, df_cat):
    print(frame.columns)

In [None]:
# Put the transformed numerical features and the indicator features side by side.
# pd.concat(axis=1) aligns rows by index label, not by position. Both frames
# hold the same rows in the same order, so df_cat's labels are stamped onto
# df_num_add first; set_axis raises if the two lengths ever differ. Without
# this, a mismatch between the two indexes would pair features from different
# articles and pad the result with NaN rows.
df_final = pd.concat([df_num_add.set_axis(df_cat.index, axis=0), df_cat], axis=1)
df_final.shape

In [None]:
# Attach the binary target (0 when shares < 1400, otherwise 1); the assignment
# aligns on the row index. Then show the NaN count per column.
shares_below_cutoff = data['shares'].lt(1400)
df_final['popularity'] = shares_below_cutoff.map({True: 0, False: 1})
df_final.isnull().sum()

In [None]:
# Drop every row that contains at least one NaN.
# NOTE(review): the raw frame was checked for missing values earlier, so NaNs
# at this point would presumably come from index alignment when the frames were
# combined - confirm that no rows are being discarded unexpectedly.
df_final=df_final.dropna()

In [None]:
# Column labels of the final modelling frame.
df_final.columns

In [None]:
# (rows, columns) of the final modelling frame after dropping NaN rows.
df_final.shape

# Classification Models to be used
**1. AdaBoost Classifier**    
**2. Logistic Regression**    
**3. Random Forest**    
**4. GaussianNB**    
**5. SVC**  
**6. KNeighborsClassifier**    
**7. Decision Tree**

In [None]:
from sklearn.model_selection import train_test_split

modelscore = []

# Target is 'popularity'; its transformed copy 'popularity_new' is excluded
# from the features together with the target itself.
target_related = ['popularity', 'popularity_new']
y = df_final['popularity']
X = df_final.drop(columns=target_related)

# 80 / 20 train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score



In [None]:
def _positive_class_scores(learner, X):
    """Return continuous scores for the positive class, for use in ROC AUC.

    Uses `predict_proba` (second column) when the learner provides it, then
    `decision_function`, and only falls back to hard `predict` labels when
    neither is available.
    """
    if hasattr(learner, "predict_proba"):
        proba = learner.predict_proba(X)
        # A learner fitted on a single class yields one column; fall through.
        if proba.ndim == 2 and proba.shape[1] > 1:
            return proba[:, 1]
    if hasattr(learner, "decision_function"):
        return learner.decision_function(X)
    return learner.predict(X)


def train_predict(learner, sample_size, X_train, y_train, X_test, y_test, n_train_eval=4000):
    """Fit `learner` on a prefix of the training data and collect timing and scores.

    Parameters
    ----------
    learner : estimator with `fit` and `predict`
        The learning algorithm to be trained and evaluated.
    sample_size : int
        Number of leading training rows used for fitting.
    X_train, y_train : training features and popularity labels.
    X_test, y_test : testing features and popularity labels.
    n_train_eval : int, default 4000
        Number of leading training rows used for the "train" scores.

    Returns
    -------
    dict
        Keys: 'train_time', 'pred_time', 'acc_train', 'acc_test',
        'f_train', 'f_test', 'auc_train', 'auc_test'.
    """
    results = {}

    # Time the fit on the first `sample_size` rows.
    start = time()
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()
    results['train_time'] = end - start

    X_train_eval = X_train[:n_train_eval]
    y_train_eval = y_train[:n_train_eval]

    # Time the hard-label predictions on the test set and the training subset.
    start = time()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train_eval)
    end = time()
    results['pred_time'] = end - start

    # Accuracy and F1 (beta=1) on the training subset and on the test set.
    results['acc_train'] = accuracy_score(y_train_eval, predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    results['f_train'] = fbeta_score(y_train_eval, predictions_train, beta=1)
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=1)

    # ROC AUC needs continuous scores: hard 0/1 predictions give a one-point
    # ROC curve. Scores are computed outside the timed section so that
    # 'pred_time' keeps measuring only the two predict() calls.
    results['auc_train'] = roc_auc_score(y_train_eval, _positive_class_scores(learner, X_train_eval))
    results['auc_test'] = roc_auc_score(y_test, _positive_class_scores(learner, X_test))

    # Success
    print ("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
    print ("{} with accuracy {}, F1 {} and AUC {}.".format(learner.__class__.__name__,\
          results['acc_test'],results['f_test'], results['auc_test']) )

    return results

In [None]:
import matplotlib.patches as mpatches
def evaluate(results, name):
    """Plot a 2x4 grid of bar charts comparing learners and save it to `name`.

    Parameters
    ----------
    results : dict
        Maps a learner name to a mapping {0, 1, 2} -> metrics dict, where the
        three entries correspond to the 1%, 10% and 100% training sizes and
        each metrics dict has the keys 'train_time', 'acc_train', 'f_train',
        'auc_train', 'pred_time', 'acc_test', 'f_test', 'auc_test'.
    name : str
        Path of the file the figure is written to.
    """
    metrics = ['train_time', 'acc_train', 'f_train', 'auc_train',
               'pred_time', 'acc_test', 'f_test', 'auc_test']

    # Create figure
    fig, ax = plt.subplots(2, 4, figsize = (16,7))

    bar_width = 0.3
    colors = ['#A00000','#00A0A0','#00A000']
    learners = list(results.keys())

    def color_for(position):
        # Cycle through the palette so more than three learners do not overflow it.
        return colors[position % len(colors)]

    for k, learner in enumerate(learners):
        for j, metric in enumerate(metrics):
            panel = ax[j//4, j%4]
            for i in range(3):
                panel.bar(i + k*bar_width, results[learner][i][metric],
                          width = bar_width, color = color_for(k))

    # Tick configuration is the same for every panel, so it is applied once
    # per panel and not once per bar.
    for panel in ax.flat:
        panel.set_xticks([0.45, 1.45, 2.45])
        panel.set_xticklabels(["1%", "10%", "100%"])
        panel.set_xlim((-0.1, 3.0))

    # Add labels
    for row in range(2):
        ax[row, 0].set_ylabel("Time (in seconds)")
        ax[row, 1].set_ylabel("Accuracy Score")
        ax[row, 2].set_ylabel("F-score")
        ax[row, 3].set_ylabel("AUC")
    for col in range(4):
        ax[1, col].set_xlabel("Training Set Size")

    # Add titles
    ax[0, 0].set_title("Model Training")
    ax[0, 1].set_title("Accuracy Score on Training Subset")
    ax[0, 2].set_title("F-score on Training Subset")
    ax[0, 3].set_title("AUC on Training Subset")
    ax[1, 0].set_title("Model Predicting")
    ax[1, 1].set_title("Accuracy Score on Testing Set")
    ax[1, 2].set_title("F-score on Testing Set")
    ax[1, 3].set_title("AUC on Testing Subset")

    # Set y-limits for score panels (every panel except the two timing ones)
    for row in range(2):
        for col in range(1, 4):
            ax[row, col].set_ylim((0, 1))

    # Create patches for the legend
    patches = [mpatches.Patch(color = color_for(i), label = learner)
               for i, learner in enumerate(learners)]
    plt.legend(handles = patches,  bbox_to_anchor = (-1.4, 2.54),\
               loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'x-large')

    # The headline matches the number of learners actually plotted.
    if len(learners) == 3:
        headline = "Performance Metrics for Three Supervised Learning Models"
    else:
        headline = "Performance Metrics for {} Supervised Learning Model{}".format(
            len(learners), "" if len(learners) == 1 else "s")
    plt.suptitle(headline, fontsize = 16, y = 1.10)

    # Lay out first, then save: the saved file gets the final layout, and
    # bbox_inches='tight' keeps the title and legend, which sit outside the
    # nominal figure area, from being cropped.
    plt.tight_layout()
    plt.savefig(name, bbox_inches = 'tight')
    plt.show()

# AdaBoost Classifier, Logistic Regression, Random Forest

In [None]:
# Import the three supervised learning models from sklearn
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import SGDClassifier
from time import time
from IPython.display import display
from sklearn.metrics import accuracy_score, fbeta_score, roc_curve, auc, roc_auc_score
# Initialize the three models
clf_A = AdaBoostClassifier(random_state=0)
clf_B = LogisticRegression(random_state=0,C=1.0)
clf_C = RandomForestClassifier(random_state=0)

# Calculate the number of samples for 1%, 10%, and 100% of the training data
samples_1 = int(X_train.shape[0]*0.01)
samples_10 = int(X_train.shape[0]*0.1)
samples_100 = X_train.shape[0]

# Collect results on the learners: results[name][i] holds the metrics for the
# i-th training size. Every learner is handled identically, so no per-model
# branching is needed.
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = train_predict(clf, samples, X_train, y_train, X_test, y_test)

# Run metrics visualization for the three supervised learning models chosen
evaluate(results,'perf_unopt.pdf')

# GaussianNB, Support Vector Classifier(SVC), KNeighborsClassifier

In [None]:
import sklearn
from sklearn.neighbors import KNeighborsClassifier
# Initialize the three models
clf_A = GaussianNB()
clf_B = SVC(random_state=0,C=1.0)
clf_C = KNeighborsClassifier(n_neighbors=5)

# Calculate the number of samples for 1%, 10%, and 100% of the training data
samples_1 = int(X_train.shape[0]*0.01)
samples_10 = int(X_train.shape[0]*0.1)
samples_100 = X_train.shape[0]

# Collect results on the learners: results[name][i] holds the metrics for the
# i-th training size. Every learner is handled identically, so no per-model
# branching is needed.
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = train_predict(clf, samples, X_train, y_train, X_test, y_test)

# Run metrics visualization for the three supervised learning models chosen
evaluate(results,'perf_unopt1.pdf')

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)


# Calculate the number of samples for 1%, 10%, and 100% of the training data
samples_1 = int(X_train.shape[0] * 0.01)
samples_10 = int(X_train.shape[0] * 0.1)
samples_100 = X_train.shape[0]

# Collect results on the learner: results[name][i] holds the metrics for the
# i-th training size.
clf_name = clf.__class__.__name__
results = {clf_name: {}}

for i, samples in enumerate([samples_1, samples_10, samples_100]):
    results[clf_name][i] = train_predict(clf, samples, X_train, y_train, X_test, y_test)


# Run metrics visualization for the supervised learning model.
# Saved under its own file name: 'perf_unopt1.pdf' is already written by the
# GaussianNB / SVC / KNN comparison and would be overwritten here.
evaluate(results, 'perf_unopt2.pdf')

In [None]:
# Feature selection 
# USING BACKWARD ELIMINATION
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# The support mask returned by RFE has one entry per column of X, so the
# names must come from X as well. df_final still contains 'popularity' and
# 'popularity_new', which would shift every name after them by one position.
cols = X.columns
# Seeded like the other models in this notebook so the ranking is reproducible.
model = RandomForestClassifier(random_state=0)
# NOTE(review): if 57 is >= the number of columns in X, RFE eliminates nothing
# and every feature is reported as selected - confirm the intended count.
rfe = RFE(model, n_features_to_select=57)

# Transforming data using RFE
X_rfe = rfe.fit_transform(X, y)

# Fitting the data to the model
model.fit(X_rfe, y)

print("Selected Features:")
selected_features = [col for col, support in zip(cols, rfe.support_) if support]
print(selected_features)
print("Feature Rankings:")
print(rfe.ranking_)

In [None]:
#Best model till now :Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  train_test_split
# 70 / 30 split used only for the final random-forest evaluation.
# NOTE(review): this rebinds y_train / y_test while X_train / X_test still hold
# the earlier 80 / 20 split, so re-running an earlier model cell after this one
# would mix the two splits.
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1)

# Seeded like the other models in this notebook so the report is reproducible.
rf=RandomForestClassifier(random_state=0)
rf.fit(x_train,y_train)

# Hard labels and positive-class probabilities on the training set ...
y_pred_train=rf.predict(x_train)
y_prob_train=rf.predict_proba(x_train)[:,1]

# ... and on the test set. The probabilities are what roc_auc_score expects
# as its score argument if AUC is to be reported.
y_pred=rf.predict(x_test)
y_prob=rf.predict_proba(x_test)[:,1]

from sklearn.metrics import accuracy_score,roc_curve,roc_auc_score, classification_report

# Generate and print the classification report
classification_report_train = classification_report(y_train, y_pred_train)
classification_report_test = classification_report(y_test, y_pred)
print('Classification Report - Training Set:')
print(classification_report_train)
print('Classification Report - Testing Set:')
print(classification_report_test)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_test, y_pred, labels=("Unpopular", "Popular")):
    """Draw the confusion matrix of `y_pred` against `y_test` as a heatmap.

    Parameters
    ----------
    y_test : true class labels.
    y_pred : predicted class labels.
    labels : pair of str, default ("Unpopular", "Popular")
        Tick labels for class 0 and class 1, in that order. Both ticks used to
        read "Unpopular", which made the two classes indistinguishable.
    """
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt="d", cmap="YlGnBu")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    # Heatmap cells are centred on 0.5, 1.5, ...
    tick_positions = [0.5, 1.5]
    plt.xticks(tick_positions, list(labels))
    plt.yticks(tick_positions, list(labels))
    plt.show()
# Confusion matrix of the random-forest predictions on the held-out rows.
plot_confusion_matrix(y_test=y_test, y_pred=y_pred)