In [189]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import math
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.corpus import stopwords 
from textblob import TextBlob
%matplotlib inline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.metrics import confusion_matrix

# Q5. K-means Clustering 

In [155]:
#Q5.1,2

all_data = fetch_20newsgroups(subset='all') #11314 Data Points

def stemmer(row):
    tokens = TextBlob(row.lower()).words
    words = [token.stem() for token in tokens]
    return words


data = random.choices(all_data.data, k=300)

# Building the tf-idf vectorizer
vectorizer = TfidfVectorizer(tokenizer=stemmer, stop_words='english')
matrix = vectorizer.fit_transform(data)

#terms will be used later in printing the most frequent items
terms = vectorizer.get_feature_names()

  'stop_words.' % sorted(inconsistent))


In [156]:
#Q5.3
k=5
model = KMeans(n_clusters=k,init='k-means++',max_iter=100)
model.fit(matrix)

KMeans(max_iter=100, n_clusters=5)

In [167]:
#Q5.4
# http://jonathansoma.com/lede/algorithms-2017/classes/clustering/k-means-clustering-with-scikit-learn/
print("Top representative terms per cluster:")
order_centroids = clusters.cluster_centers_.argsort()[:, ::-1]
for i in range(k): #for each cluster
    top_ten_words = [terms[ind] for ind in order_centroids[i, :10]] 
    print("Cluster {}: {}".format(i, ', '.join(top_ten_words)))

Top representative terms per cluster:
Cluster 0: sterl, tsakc, twisto.compaq.com, mppa3, hmm, there', non-human, paid-for, neuralgia, indian
Cluster 1: 'junk, legendari, oak.oakland.edu, spleen, turkey, vos.stratus.com, nc, treatment, atom, keysiz
Cluster 2: mpce.mq.edu.au, wierd, respons, 'net, feenix.metronet.com, mpce, cb650, foundat, 12mb, akin
Cluster 3: erixon, erh0362, 4488, indescrimin, erika, consol, amicu, nasa/jpl/caltech, eventu, tclock
Cluster 4: barn, bcso, streak, inexpenc, saltillo.cs.utexas.edu, 29848, order, gif, believ, info


In [170]:
#Q5.5
# The title is from https://www.npr.org/sections/coronavirus-live-updates/2020/12/02/941584691/virginia-county-votes-to-reject-gov-northams-coronavirus-restrictions

X = vectorizer.transform(["Virginia County Votes To Reject Gov. Northam's Coronavirus Restrictions"])

model.predict(X)[0]

4

# Q6. Regression Analysis

In [217]:
data = pd.read_excel('ENB2012_data.xlsx')
data = data.sort_values(by=['X1'],ascending=False)

In [218]:
nr = Normalizer(copy=False)
X = data.drop(['Y'], axis=1)
X = nr.fit_transform(X)
y = data[['Y']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 32)


In [259]:
from sklearn.model_selection import KFold ,cross_val_score
# Multivariate Linear Regression
clf = LinearRegression().fit(X_train, y_train)
folds = KFold(n_splits = 5, shuffle = True)
scores = cross_val_score(clf, X_train, y_train, scoring='r2', cv=5)
print("Using the cross validation on our Linear Regression Model the range of the scores our model can get are varying between",
      min(scores),"and",max(scores))   

y_pred = clf.predict(X_test)
modified_y_pred = []
for value in y_pred:
    if (value[0])<=0.5:
        modified_y_pred.append([0])
    else :
        modified_y_pred.append([1])

print("The confusion matrix for the Linear Regression Model is:")
print(confusion_matrix(y_test, modified_y_pred))

tn, fp, fn, tp = confusion_matrix(y_test, modified_y_pred).ravel()

print("The accuracy for the Linear Regression Model is:",(tn+tp)/(tn+fp+fn+tp)*100)

Using the cross validation on our Linear Regression Model the range of the scores our model can get are varying between 0.8833396083426609 and 0.9909928470046097
The confusion matrix for the Linear Regression Model is:
[[75  0]
 [ 6 73]]
The accuracy for the Linear Regression Model is: 96.1038961038961


In [256]:
# Logistic Regression

clf = LogisticRegressionCV(cv=5).fit(X_train, y_train)
folds = KFold(n_splits = 5, shuffle = True)
scores = cross_val_score(clf, X_train, y_train, scoring='r2', cv=5)



In [258]:
print("Using the cross validation on our Logistic Regression Model the range of the scores our model can get are varying between",
      min(scores),"and",max(scores))   

y_pred = clf.predict(X_test)

# In this confusion matrix Rows are the true classes
print("The confusion matrix for the Logistic Regression Model is:")
print(confusion_matrix(y_test, y_pred))

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print("The accuracy for the Logistic Regression Model is:",(tn+tp)/(tn+fp+fn+tp)*100)


Using the cross validation on our Logistic Regression Model the range of the scores our model can get are varying between 0.8694960212201591 and 1.0
The confusion matrix for the Logistic Regression Model is:
[[75  0]
 [ 6 73]]
The accuracy for the Logistic Regression Model is: 96.1038961038961
