In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.externals import joblib
from sklearn.svm import SVC

In [2]:
# Load the author-labelled feature table from CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="aman_ml_authors_20.csv")

In [3]:
# Preview the first five rows. The call form of print is valid on both
# Python 2 and Python 3 and matches the print(...) calls used elsewhere
# in this notebook.
print(data.head())

   author_id  adj  adp  adv  aux  cconj  det  intj  noun  num  part  pon  \
0         80  118  240   50   30      0  193     0   425   18    24  100   
1         80  107  206   69   42      0  130     0   400   11    33  155   
2         80  126  236   62   18      0  187     0   432    8    13  125   
3         80  119  242   47   29      0  209     0   447   15    19   80   
4         80   90  200   60   32      0  180     0   427   16    21  122   

   propn  punct  sconj  sym  verb  
0     60      0     17    0   179  
1     71      0     32    0   197  
2     62      0     12    0   178  
3     71      0      8    0   158  
4     75      0     24    0   204  


In [4]:
# (rows, columns) of the loaded table.
print(data.shape)

(7612, 17)


In [5]:
# Report how many rows each author contributes, in order of first appearance.
authorList = data.author_id.unique()
# Summing the boolean mask counts the matching rows for each author.
size = [int((data['author_id'] == auth).sum()) for auth in authorList]
for auth, n_rows in zip(authorList, size):
    print("Author: %5s  Size: %5s" % (auth, n_rows))

Author:    80  Size:   402
Author:  1087  Size:  1555
Author:  1090  Size:   121
Author:  1430  Size:   118
Author:  1458  Size:   122
Author:  1579  Size:   117
Author:  2311  Size:   515
Author:  3015  Size:   116
Author:  3263  Size:   155
Author:  3928  Size:    45
Author:  4310  Size:    50
Author:  4434  Size:   233
Author:  5248  Size:  2623
Author:  6538  Size:   163
Author:  7652  Size:   365
Author:  7742  Size:   207
Author:  8467  Size:   132
Author: 10454  Size:   204
Author: 10588  Size:    86
Author: 10813  Size:   283


In [6]:
# Classification label: the author each row belongs to.
y = data['author_id']
# Features: every remaining column.
# Dropping a 'doc_id' column is not required for this problem.
X = data.drop(['author_id'], axis=1)


In [7]:
# Show the dtype of every feature column.
print(X.dtypes)

adj      int64
adp      int64
adv      int64
aux      int64
cconj    int64
det      int64
intj     int64
noun     int64
num      int64
part     int64
pon      int64
propn    int64
punct    int64
sconj    int64
sym      int64
verb     int64
dtype: object


In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps each author's
# share the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
    stratify=y,
)

In [None]:
# One-off standardisation of the training features (per-column zero mean,
# unit variance). preprocessing.scale does not keep the fitted statistics,
# so this result cannot be re-applied to X_test.
X_train_scaled = preprocessing.scale(X_train)
# print() call instead of the Python-2-only print statement.
print(X_train_scaled)

In [None]:
# Per-column mean of the scaled training features.
print(X_train_scaled.mean(axis=0))

In [None]:
# Per-column standard deviation of the scaled training features.
print(X_train_scaled.std(axis=0))

In [None]:
# Pre-processing
# Fit the scaler on the training split only, so the same training statistics
# can later be applied to the held-out split.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
# Per-column means should be ~0 (up to floating-point error).
print(X_train_scaled.mean(axis=0))
# Per-column standard deviations should be 1, except for constant
# (zero-variance) columns, which StandardScaler does not rescale and which
# therefore report 0.
print(X_train_scaled.std(axis=0))

In [None]:
# Apply the scaler fitted on the training split to the held-out split.
X_test_scaled = scaler.transform(X_test)

# These are only approximately 0 and 1: the statistics come from the
# training data, not from X_test itself.
print(X_test_scaled.mean(axis=0))

print(X_test_scaled.std(axis=0))


In [None]:
## SVC
# Standardise inside a pipeline so that fit/predict/score all apply the same
# training-set scaling automatically. Previously the classifier was trained
# on the raw counts even though a scaler had been fitted, so the scaling
# work was never used.
clf = make_pipeline(
    preprocessing.StandardScaler(),
    SVC(kernel='linear', verbose=True, C=1),
)
clf.fit(X_train, y_train)


In [None]:
y_pred = clf.predict(X_test)

# author_id values are nominal class labels, so a regression metric such as
# R^2 is not meaningful here; report classification accuracy instead.
print('Train Accuracy')
print(clf.score(X_train, y_train))

print('Test Accuracy')
print(accuracy_score(y_test, y_pred))

In [None]:
### Not related

%matplotlib inline
from string import letters
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white")

# Generate a large random dataset
rs = np.random.RandomState(33)
df = pd.DataFrame(data=X)
# Compute the correlation matrix

corr = df.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
 square=True,
 linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)


In [None]:
# To figure out the distinct authors

# Series.unique() returns the values in order of first appearance, which
# matches the previous hand-written loop without rescanning the result list
# for every row.
output = y.unique().tolist()
print(output)

In [None]:
# Visualization using TSNE

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import itertools
import matplotlib.cm as cm

# Fixed seed so the embedding (and therefore the plot) is repeatable
# across runs.
X_embedded = TSNE(n_components=2, perplexity=40, verbose=2,
                  random_state=0).fit_transform(X)

# Frameless, tick-free axes: only the 2-D point cloud is of interest.
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(frameon=False)
plt.setp(ax, xticks=(), yticks=())
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                wspace=0.0, hspace=0.0)
# Colour each point by its author id.
plt.scatter(X_embedded[:, 0], X_embedded[:, 1],
        c=y, marker="x",cmap="jet")

In [None]:
# Imported here so this cell runs on a fresh kernel; the name was otherwise
# only imported by a later cell.
from sklearn import random_projection

# Project the features onto 2 dimensions with a seeded sparse random matrix.
rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
X_embedded = rp.fit_transform(X)
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(frameon=False)
plt.setp(ax, xticks=(), yticks=())
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                wspace=0.0, hspace=0.0)
# Colour each point by its author id.
plt.scatter(X_embedded[:, 0], X_embedded[:, 1],
        c=y, marker="x",cmap="jet")

In [None]:
# Imported here so this cell runs on a fresh kernel; the name was otherwise
# only imported by a later cell.
from sklearn import decomposition

# Reduce the features to their first two SVD components.
X_embedded = decomposition.TruncatedSVD(n_components=2).fit_transform(X)
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(frameon=False)
plt.setp(ax, xticks=(), yticks=())
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                wspace=0.0, hspace=0.0)
# Colour each point by its author id.
plt.scatter(X_embedded[:, 0], X_embedded[:, 1],
        c=y, marker="x",cmap="jet")

In [None]:
# Imported here so this cell runs on a fresh kernel; the name was otherwise
# only imported by a later cell.
from sklearn import manifold

# NOTE(review): `n_neighbors` was never defined anywhere in the notebook.
# 30 is a placeholder neighbourhood size - confirm or tune.
n_neighbors = 30

# Passed by keyword: current scikit-learn releases no longer accept
# n_neighbors positionally.
X_embedded = manifold.Isomap(n_neighbors=n_neighbors,
                             n_components=2).fit_transform(X)
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(frameon=False)
plt.setp(ax, xticks=(), yticks=())
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                wspace=0.0, hspace=0.0)
# Colour each point by its author id.
plt.scatter(X_embedded[:, 0], X_embedded[:, 1],
        c=y, marker="x",cmap="jet")

In [None]:
from sklearn import (manifold, datasets, decomposition, ensemble,
                     discriminant_analysis, random_projection)

# NOTE(review): `n_neighbors` was never defined anywhere in the notebook.
# 30 is a placeholder neighbourhood size - confirm or tune.
n_neighbors = 30

# Passed by keyword: current scikit-learn releases no longer accept
# n_neighbors positionally.
lle = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=2,
                                      method='standard')
X_embedded = lle.fit_transform(X)
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(frameon=False)
plt.setp(ax, xticks=(), yticks=())
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                wspace=0.0, hspace=0.0)
# Colour each point by its author id.
plt.scatter(X_embedded[:, 0], X_embedded[:, 1],
        c=y, marker="x",cmap="jet")

In [None]:
# Seeded so the single-initialisation MDS run (n_init=1) gives the same
# embedding on every execution, like the other seeded embeddings here.
mds = manifold.MDS(n_components=2, n_init=1, max_iter=100, random_state=0)
X_embedded = mds.fit_transform(X)
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(frameon=False)
plt.setp(ax, xticks=(), yticks=())
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                wspace=0.0, hspace=0.0)
# Colour each point by its author id.
plt.scatter(X_embedded[:, 0], X_embedded[:, 1],
        c=y, marker="x",cmap="jet")

In [None]:
# Spectral embedding of the features into two dimensions (seeded, ARPACK
# eigen-solver), drawn as a scatter plot coloured by author id.
embedder = manifold.SpectralEmbedding(
    n_components=2,
    random_state=0,
    eigen_solver="arpack",
)
X_embedded = embedder.fit_transform(X)

# Frameless, tick-free axes filling almost the whole 10x10 figure.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, frameon=False)
ax.set(xticks=(), yticks=())
fig.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                    wspace=0.0, hspace=0.0)
ax.scatter(X_embedded[:, 0], X_embedded[:, 1],
           c=y, marker="x", cmap="jet")