In [1]:
import pandas as pd
import numpy as np

classes = ['Sydney', 'Melbourne', 'Brisbane', 'Perth']
train_raw = pd.read_csv('train-raw.tsv', sep='\t', names=['class', 'text']).reset_index(drop=True)
X_train_raw = train_raw['text']
y_train = train_raw['class']

dev_raw = pd.read_csv('dev-raw.tsv', sep='\t', names=['class', 'text']).reset_index()
X_dev_raw = dev_raw['text']
y_test = dev_raw['class']

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
vectoriser = CountVectorizer()
X_train = vectoriser.fit_transform(X_train_raw)
X_test = vectoriser.transform(X_dev_raw)

In [3]:
#testing to see if it's working well
from sklearn.feature_selection import SelectKBest, chi2
x2 = SelectKBest(chi2, k=10)
X_train_x2 = x2.fit_transform(X_train,y_train)
X_test_x2 = x2.transform(X_test)

print(X_test_x2.shape, X_train_x2.shape)

(37300, 10) (103360, 10)


In [4]:
#testing chi-square
for feat_num in x2.get_support(indices=True):
    print(vectoriser.get_feature_names()[feat_num])

brisbane
melbourne
perth
sydney
u0627
u0644
u0645
u0646
u0648
u064a


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

models = [GaussianNB(),
          MultinomialNB(),
          DecisionTreeClassifier(max_depth=1),
          KNeighborsClassifier(n_neighbors=1),
          KNeighborsClassifier(n_neighbors=5),
          DecisionTreeClassifier(max_depth=None)]
#          svm.LinearSVC(C=C),
#          svm.SVC(kernel='rbf', gamma=0.7, C=C),
#          svm.SVC(kernel='poly', degree=3, C=C)]
titles = ['GNB',
          'MNB',
          'one-r',
          '1-nearest neighbour',
          '5-nearest neighbour',
          'Decision Tree']
#          'LinearSVC',
#          'SVM with a cubic kernel',
#          'SVM with an RBF kernel']

k = 100

x2 = SelectKBest(chi2, k=k)
x2.fit(X_train,y_train)
X_train_x2 = x2.transform(X_train)
X_test_x2 = x2.transform(X_test)


Xs = [(X_train_x2, X_test_x2)]
X_names = ['x2']
for title, model in zip(titles, models):
    for X_name, X in zip(X_names, Xs):
        X_train_t, X_test_t = X
        model.fit(X_train_t.todense(), y_train)
        acc = model.score(X_test_t.todense(), y_test)
        print('k', k, title, 'features', X_name, 'acc',  acc)

k 100 GNB features x2 acc 0.30107238605898123
k 100 MNB features x2 acc 0.30053619302949064
k 100 one-r features x2 acc 0.26246648793565686
k 100 1-nearest neighbour features x2 acc 0.2975871313672922
k 100 5-nearest neighbour features x2 acc 0.2988471849865952
k 100 Decision Tree features x2 acc 0.30343163538873996


In [6]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X_train_x2, y_train, cv=kfold)
print(results.mean())

0.3087848297213622
