In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [8]:
df = pd.read_table("data/WebKB/webkb-train-stemmed.txt",header=None)
df.columns=["Y","X"]
classes = {'project':0, 'faculty':1, 'course':2, 'student':3}
Y = np.array([classes[i0] for i0 in df["Y"]])
X = df["X"]

In [9]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X.values.astype('U')).toarray()
print(X.shape)

(2803, 7288)


In [4]:
r1 = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)
scores = cross_val_score(r1, X, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.87 (+/- 0.03)


In [10]:
Xshort = np.load("webkb_selected_rfs.npy")
print(Xshort.shape)

(2803, 978)


In [6]:
r2 = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)
scores = cross_val_score(r2, Xshort, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.87 (+/- 0.02)


In [7]:
k1 = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
scores = cross_val_score(k1, X, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.73 (+/- 0.03)


In [8]:
k2 = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
scores = cross_val_score(k2, Xshort, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.60 (+/- 0.02)


In [9]:
k1 = KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
scores = cross_val_score(k1, X, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.72 (+/- 0.04)


In [10]:
k2 = KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
scores = cross_val_score(k2, Xshort, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.60 (+/- 0.02)


In [11]:
s1 = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None, random_state=None)
scores = cross_val_score(s1, X, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.22 (+/- 0.00)


In [12]:
s2 = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None, random_state=None)
scores = cross_val_score(s2, Xshort, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.22 (+/- 0.00)


In [13]:
s1 = SVC(C=1.0, kernel='linear', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None, random_state=None)
scores = cross_val_score(s1, X, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.90 (+/- 0.03)


In [14]:
s2 = SVC(C=1.0, kernel='linear', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None, random_state=None)
scores = cross_val_score(s2, Xshort, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.86 (+/- 0.04)


In [11]:
gnb = GaussianNB()
scores = cross_val_score(gnb, Xshort, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.37 (+/- 0.02)


In [12]:
bnb = BernoulliNB()
scores = cross_val_score(bnb, Xshort, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.72 (+/- 0.05)


In [13]:
mnb = MultinomialNB()
scores = cross_val_score(mnb, Xshort, Y, cv=5, scoring='f1_weighted')
print("Fmeasure: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Fmeasure: 0.70 (+/- 0.04)
