In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [2]:
Fake_News = pd.read_csv("https://raw.githubusercontent.com/ba18406/Lab001/master/fake_news.csv")

In [10]:
#Encoding categorical variables in dataSet
encoder1 = LabelEncoder()
Fake_News['Usage'] = encoder1.fit_transform(Fake_News['Usage'])

encoder2 = LabelEncoder()
Fake_News['Expected'] = encoder2.fit_transform(Fake_News['Expected'])

In [4]:
Usage_data = Counter(Fake_News['Usage'])
print("\nDataSet 3\nImbalance: ", (Usage_data[0]/(Usage_data[0]+Usage_data[1]))*100, "%")
print(Usage_data)


DataSet 3
Imbalance:  85.73995379745031 %
Counter({0: 120252, 1: 20000})


In [11]:
#Splitting the dependant and independant variables
X = Fake_News.iloc[:, 0:1]
Y = Fake_News.iloc[:, -1]

In [6]:
#Finding the best depth for tree by performing a grid search.
#Additionaly performing 10 Fold cross validation of each classifier 
from sklearn import tree
from sklearn.model_selection import cross_val_score
depth = []
for i in range(3,20):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    # Perform 10-fold cross validation 
    scores = cross_val_score(estimator=clf, X=X, y=Y, cv=10, n_jobs=4)
    depth.append((i,scores.mean()*100))
print(depth)
#Hence we can achieve around 85% accuracy using a decision tree if we set max depth of the tree from 5-13

[(3, 74.27980808750137), (4, 74.28979005884922), (5, 74.28479917484539), (6, 74.28194712493452), (7, 74.29406843872583), (8, 74.30333810928673), (9, 74.32829364767697), (10, 74.4152810682885), (11, 74.50939957954327), (12, 74.77677895367731), (13, 74.93292695790579), (14, 74.94790103329873), (15, 74.99923681332326), (16, 74.96643778183272), (17, 75.03845295711326), (18, 75.11973424450085), (19, 75.13114478255687)]


In [34]:
#Training a Random Forest Classifier.
#Also performing 10 Fold Cross Validation.
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(n_estimators=100)
randomforest.fit(X, Y)
RandomForest_CV_Score = cross_val_score(randomforest, X, Y, cv=10)
print(RandomForest_CV_Score)
print("Mean Score: ", RandomForest_CV_Score.mean())
#Hence around 85% accuracy

[0.3259703  0.16248596 0.1651067  0.59628104 0.78971671 0.79208786
 0.7959311  0.79306041 0.7978033  0.79518223]
('Mean Score: ', 0.6013625605520336)


In [12]:
#Spliting the data into 10 equal portions using stratified K Fold. 
#Using 1 portion for testing and 9 for training.
from sklearn.model_selection import StratifiedKFold
bins = StratifiedKFold(n_splits = 10)
for train_index, test_index in bins.split(X,Y):
    #print(test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    training_data, testing_data = Fake_News.iloc[train_index], Fake_News.iloc[test_index]

In [None]:
#Now finding the ideal number of clusters.
#Using Elbow Method

#mms = MinMaxScaler()
#data_transformed= mms.fit_transform(X_train)

inertiaS = []
for k in range(2,11):
    km = KMeans(n_clusters = k)
    #print(km.inertia_)
    cluster_labels = km.fit_predict(X_train)
    inertiaS.append(km.inertia_)
    avg_score = silhouette_score(X_train, cluster_labels)
    print("For n_clusters =", k, "The average silhouette_score is :", avg_score)
    
plt.plot(range(2, 11), inertiaS)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

#Hence according to Elbow method and silhoutte analysis, the ideal number of clusters should be 2.

In [13]:
#Running K-means on the selected number of clusters, i.e: 2
km = KMeans(n_clusters = 2, init='k-means++', max_iter=300, n_init=10, random_state=0)
km.fit(training_data)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [14]:
cluster_labels = km.fit_predict(X_train)
cluster_labels

array([0, 0, 0, ..., 1, 1, 1])

In [15]:
for a,b,c in zip(X_train, cluster_labels , Y_train):
    print(a, "cluster = ", b, "labels =", c)

Id cluster =  0 labels = 1


In [16]:
#storing cluster number and respective centroids with number of samples
label0 = np.array([]) # cluster 0 with minority class
label1 = np.array([])# cluster 1 with minority class
count0 = 0
count1 = 0
numOfLabels0 = 0
numOfLabels1 = 0
for cluster,centroid,labels in zip(km.labels_,X_train,Y_train):
    if(cluster==0 and labels == 1):
        count0 = count0 + 1
        label0 = np.append(label0 ,[centroid,"cluster",cluster,'labels',labels])
        numOfLabels0 = count0
    elif(cluster==1 and labels == 1):
        count1 = count1 + 1
        label1 = np.append(label1 ,[centroid,"cluster",cluster,'labels',labels])
        numOfLabels1 = count1
        
print('total num minority class samples in cluster(0):',numOfLabels0)
print('total num minority class samples in cluster(1):',numOfLabels1)

#centroid of cluster-1 
#print("Label 0:",label0)
#centroid of cluster-2
#print("label 1:",label1)

total num minority class samples in cluster(0): 1
total num minority class samples in cluster(1): 0
