In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups corpus using the loader's default arguments.
data = fetch_20newsgroups()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Show the newsgroup names supplied by the loader. Assigning a hand-typed copy
# here displayed nothing and could silently drift from the fetched data.
data.target_names

In [None]:
# Restrict the corpus to four newsgroups and load the matching train and
# test subsets (train is fetched first, then test).
categories = [
    'talk.religion.misc',
    'soc.religion.christian',
    'sci.space',
    'comp.graphics',
]
train, test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)

In [None]:
# Print the raw text of one training post (index 5), i.e. the kind of input the model is fitted on.
print(train.data[5])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Chain a TF-IDF vectoriser into a multinomial naive Bayes classifier so the
# pipeline can be fitted and queried with raw strings.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [None]:
# Fit the pipeline on the training posts, then predict a label for every test post.
model.fit(train.data, train.target)
labels = model.predict(test.data)

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.metrics import confusion_matrix
# Confusion matrix of the test predictions, drawn transposed so that the
# x axis carries the true labels and the y axis the predicted labels.
mat = confusion_matrix(test.target, labels)
ax = sns.heatmap(
    mat.T,
    square=True,
    annot=True,
    fmt='d',
    cbar=False,
    xticklabels=train.target_names,
    yticklabels=train.target_names,
)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label');

In [None]:
def predict_category(s, train=train, model=model):
    """Return the newsgroup name predicted for a single string.

    `s` is wrapped in a one-element list before being handed to
    `model.predict`; the first predicted index is then looked up in
    `train.target_names`. The `train` and `model` defaults are bound to the
    notebook's objects at the moment this cell is executed.
    """
    predicted = model.predict([s])
    first_label = predicted[0]
    return train.target_names[first_label]

In [None]:
# Example query: a phrase about spaceflight.
predict_category('sending a payload to the ISS')

In [None]:
# Example query: a phrase about religion.
predict_category('discussing islam vs atheism')

In [None]:
# Example query: a phrase about computer displays.
predict_category('determining the screen resolution')

In [None]:
import pandas as pd
# header=None: no row of Wine.csv is treated as column names; the names are
# assigned explicitly in a later cell.
wine = pd.read_csv("Wine.csv", header=None)
wine

In [None]:
# Name the 14 columns: the class label first, followed by the 13 columns
# that are later used as features.
wine.columns = [
    "class", "alc", "malic", "ash", "alc_of_ash", "mag", "to_phe",
    "fla", "nonfla", "pro", "col", "hue", "dilu", "proline",
]
wine.head()

In [None]:
# Count missing values in each column.
wine.isnull().sum()

In [None]:
# Pairwise scatter plots of the columns, coloured by wine class. The bare name
# `sns.pairplot` only echoed the function object and drew nothing.
sns.pairplot(wine, hue="class")
plt.show()

In [None]:
# Plot the distribution of every column except the first ("class"), one
# titled figure per column.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm
# the installed version before relying on it.
for col in wine.columns[1:]:
    sns.distplot(wine[col])
    plt.title(col)
    plt.show()

In [None]:
# Preview the distribution of "malic" after a natural-log transform.
sns.distplot(np.log(wine["malic"]))

In [None]:
# Replace "malic" with its natural log. The column is overwritten, so running
# this cell a second time would log-transform it again.
wine["malic"] = np.log(wine["malic"])

In [None]:
# Preview the distribution of "nonfla" after a natural-log transform.
sns.distplot(np.log(wine["nonfla"]))

In [None]:
# Replace "nonfla" with its natural log. The column is overwritten, so running
# this cell a second time would log-transform it again.
wine["nonfla"] = np.log(wine["nonfla"])

In [None]:
# Preview the distribution of "col" after a natural-log transform.
sns.distplot(np.log(wine["col"]))

In [None]:
# Replace "col" with its natural log. The column is overwritten, so running
# this cell a second time would log-transform it again.
wine["col"] = np.log(wine["col"])

In [None]:
# Preview the distribution of "proline" after a natural-log transform.
sns.distplot(np.log(wine["proline"]))

In [None]:
# Replace "proline" with its natural log. The column is overwritten, so running
# this cell a second time would log-transform it again.
wine["proline"] = np.log(wine["proline"])

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(12,10))
sns.heatmap(wine.corr(),annot=True)

In [None]:
# Scatter-plot matrix of "fla" against the three columns that the next cell drops.
sns.pairplot(wine[["fla","to_phe","pro","dilu"]])
plt.show()

In [None]:
# Remove three columns. Rebinding `wine` to the result of drop() with
# errors="ignore" makes the cell safe to re-run: the in-place version raised
# KeyError on a second run because the columns were already gone.
wine = wine.drop(["to_phe", "pro", "dilu"], axis="columns", errors="ignore")

In [None]:
# Correlation heatmap again, now on the frame without the dropped columns.
plt.figure(figsize=(10,8))
sns.heatmap(wine.corr(),annot=True)

In [None]:
# For each column, count how many OTHER columns it is strongly correlated with.
# The absolute value catches strong negative correlations as well, and the
# diagonal (each column against itself) is masked out so it does not inflate
# every count by one.
corr_abs = wine.corr().abs()
off_diagonal = ~np.eye(len(corr_abs), dtype=bool)
((corr_abs > 0.7) & off_diagonal).sum()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Features are every column except
# "class" (the first column); the target is "class".
x_train, x_test, y_train, y_test = train_test_split(
    wine.drop("class", axis="columns"),
    wine["class"],
    test_size=0.3,
    random_state=421,
)

In [None]:
# One shape per line, in the order: x_train, y_train, x_test, y_test.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep="\n")

In [None]:
def mse(model, X, y):
    """Return the mean squared difference between `y` and `model.predict(X)`.

    The (estimator, X, y) signature also lets this function be passed as the
    `scoring` callable of `cross_val_score`.
    """
    residuals = y - model.predict(X)
    return np.mean(residuals ** 2)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Fit LDA on the training split and report the mean squared error of its
# predicted class labels on both splits. mse() takes no square root, so the
# values are labelled "mse" rather than "rmse".
lda_model = LinearDiscriminantAnalysis().fit(x_train,y_train)
lda_tr_mse = mse(lda_model, x_train, y_train)
lda_te_mse = mse(lda_model, x_test, y_test)
print("LDA train mse : ", lda_tr_mse)
print("LDA test mse : ", lda_te_mse)

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Fit QDA on the training split and report the mean squared error of its
# predicted class labels on both splits. mse() takes no square root, so the
# values are labelled "mse" rather than "rmse".
qda_model = QuadraticDiscriminantAnalysis().fit(x_train,y_train)
qda_tr_mse = mse(qda_model, x_train, y_train)
qda_te_mse = mse(qda_model, x_test, y_test)
print("QDA train mse : ", qda_tr_mse)
print("QDA test mse : ", qda_te_mse)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the training split and report the
# mean squared error of its predicted class labels on both splits. mse() takes
# no square root, so the values are labelled "mse" rather than "rmse".
# NOTE(review): the 'sag' solver is stochastic — consider passing random_state
# if exactly reproducible numbers are required.
logi_model = LogisticRegression(solver='sag', max_iter=100000, multi_class="auto")
logi_model.fit(x_train, y_train)
logi_tr_mse = mse(logi_model, x_train, y_train)
logi_te_mse = mse(logi_model, x_test, y_test)
print("Logistic Regression train mse : ", logi_tr_mse)
print("Logistic Regression test mse : ", logi_te_mse)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score

# Compare polynomial feature expansions of degree 1 to 3 by the mean 5-fold
# cross-validated MSE on the training split.
for N in range(1, 4):
    poly = PolynomialFeatures(degree=N, include_bias=False)
    x_train_poly = poly.fit_transform(x_train)
    # No explicit fit here: cross_val_score clones the estimator and fits it on
    # each fold itself. A full-data fit beforehand only cost time and left
    # logi_model trained on the expanded features instead of the raw ones.
    cv_error = np.mean(cross_val_score(logi_model, x_train_poly, y_train, scoring=mse, cv=5))

    print("MSE of degree" , N , " : ", cv_error)

In [None]:
# Side-by-side summary of the test-split MSE of the three classifiers.
print("LDA Test MSE : ", lda_te_mse)
print("QDA Test MSE : ", qda_te_mse)
print("Logistic Regression Test MSE : ", logi_te_mse)