# Tf-idf Vectorizer.

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

## Load train and test data

In [15]:
train = pd.read_csv('train_pos.csv')
test = pd.read_csv('test_pos.csv')

print(train.head())
print(test.head())

   URL                                           Articles
0    0  microorg tiny seen ey distribut world ev insid...
1    1  download vimeo video vimeo consid greatest sou...
2    0  ask quest dat sci march min read not edit thin...
3    0  almost year ago day ryanair europ largest low ...
4    0  stop heard min industry stat slow econom decli...
   URL                                           Articles
0    1  walt car mak sur mad first day new job company...
1    0  provid cbc elect campaign commun littl scheer ...
2    1  sport ml leagu season play play big top play s...
3    1  beach plac day good company beach lik peopl or...
4    0  forc withdraw northern syr sunday iraq kurd te...


## Create independent and dependent variables

In [16]:
X_train = train['Articles']
y_train = train['URL']

X_test = test['Articles']
y_test = test['URL']

print('X_train.shape: {}'.format(X_train.shape))
print('X_test.shape: {}'.format(X_test.shape))

X_train.shape: (901,)
X_test.shape: (36,)


## Create a Tfidf object

In [17]:
# # Create a count vectorizer object - word level
vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\b[^\d\W]+\b')

# Create a count vectorizer object - ngram level
# vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\b[^\d\W]+\b', ngram_range=(1,5))

# # Create a count vectorizer object - char level
# vectorizer = TfidfVectorizer(analyzer='char', token_pattern=r'\b[^\d\W]+\b', ngram_range=(3,5))

## Fit and transform

In [18]:
# Fit
vectorizer.fit(X_train)

# Create vectors.
Xtrain = vectorizer.transform(X_train).toarray()
Xtest = vectorizer.transform(X_test).toarray()

print(len(vectorizer.get_feature_names()))
print(len(vectorizer.vocabulary_))

4605
4605


## Function to train and test

In [19]:
def calculate_efficiency(model, Xtrain, y_train, Xtest, y_test):    
    spam_model = model.fit(Xtrain, y_train)
    
    # Predict and calulate efficiency
    y_pred = spam_model.predict(Xtest)

    conf_mat = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    
    print('Model:\n{}'.format(model))
    print('\nConfusion Matrix:\n{}'.format(conf_mat))
    print('\nAccuracy:\n{}'.format(accuracy))    

## Naive Bayes Calssifier

In [20]:
from sklearn.naive_bayes import MultinomialNB
modelNB = MultinomialNB()
calculate_efficiency(modelNB, Xtrain, y_train, Xtest, y_test)

Model:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Confusion Matrix:
[[20  0]
 [14  2]]

Accuracy:
0.6111111111111112


## Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression
modelLR = LogisticRegression(solver='lbfgs')
calculate_efficiency(modelLR, Xtrain, y_train, Xtest, y_test)

Model:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

Confusion Matrix:
[[20  0]
 [10  6]]

Accuracy:
0.7222222222222222


## SVM Model

In [22]:
from sklearn import svm
modelSVM = svm.SVC(gamma='scale')
calculate_efficiency(modelSVM, Xtrain, y_train, Xtest, y_test)

Model:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Confusion Matrix:
[[19  1]
 [11  5]]

Accuracy:
0.6666666666666666


## Bagging Model - RFC

In [23]:
from sklearn.ensemble import RandomForestClassifier
modelRFC = RandomForestClassifier(n_estimators=100, max_depth=2)
calculate_efficiency(modelLR, Xtrain, y_train, Xtest, y_test)                        

Model:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

Confusion Matrix:
[[20  0]
 [10  6]]

Accuracy:
0.7222222222222222


## Boosting Model - XGB

In [24]:
import xgboost
from xgboost import XGBClassifier
modelXGB = xgboost.XGBClassifier()
calculate_efficiency(modelXGB, Xtrain, y_train, Xtest, y_test)                        

Model:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

Confusion Matrix:
[[19  1]
 [ 3 13]]

Accuracy:
0.8888888888888888
