### Imports

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, confusion_matrix, recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

### Loading data

In [3]:

train_data = pd.read_csv(open(".\\data\\train_features.tsv", encoding="utf8"), sep='\t')
train_labels = pd.read_csv(open(".\\data\\train_labels.tsv", encoding='utf8'), sep = '\t')
valid_data = pd.read_csv(open(".\\data\\valid_features.tsv", encoding="utf8"), sep='\t')
valid_labels = pd.read_csv(open(".\\data\\valid_labels.tsv", encoding="utf8"), sep='\t')

In [36]:
test_data = pd.read_csv(open('test_features.csv', encoding='utf8'))

### Predictions for kaggle competition

In [4]:
def predictions(test_data, model):
    #Unpacks the model and count vectoriser, and transforms the test instances into a format usable by the model
    modelLR, vectoriser = model
    tag_data = pd.DataFrame(vectoriser.transform(test_data['tag']).toarray())
    av_data = test_data.drop(columns=['movieId','title','YTId','year','tag'])
    data_transformed = pd.concat([tag_data,av_data],axis=1)
    
    #Predictions are made, and the result saved to file
    y_pred = modelLR.predict(data_transformed)
    df = pd.DataFrame()
    df['movieId'] = test_data['movieId']
    df['genres'] = y_pred
    df.to_csv('predictions.csv', index=False)

# Naive Bayes baseline

In [4]:
def trainNB(train_data, train_labels):
    #Convert tags to a vector of word frequencies
    vectoriser = CountVectorizer()
    X = vectoriser.fit_transform(train_data['tag'])
    
    #Train Multinomial NB model.
    nbModel = MultinomialNB()
    nbModel.fit(X, train_labels['genres'])
    return (nbModel, vectoriser)

In [5]:
def evaluateNB(model, valid_data, valid_labels):
    
    #Unpack model and vectoriser, and transform data to useable format
    nbModel, vectoriser = model
    X = vectoriser.transform(valid_data['tag'])
    
    #Predictions made, evaluation metrics calculated.
    y_pred = nbModel.predict(X)
    y_true = valid_labels['genres']
    f1_score_weighted = f1_score(y_true, y_pred, average='weighted')
    acc = nbModel.score(X, y_true)
    labels = nbModel.classes_
    
    #Evaluation results printed
    print(labels)
    print(confusion_matrix(y_true,y_pred,labels=labels))    
    print('Weighted f-score: ', f1_score_weighted)
    print('Total accuracy: ', acc) 
    return
    

### Example usage of Naive Bayes baseline

In [6]:
modelNB = trainNB(train_data, train_labels)
evaluateNB(modelNB, valid_data, valid_labels)

['Action' 'Adventure' 'Animation' 'Children' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Fantasy' 'Film_Noir' 'Horror' 'Musical' 'Mystery'
 'Romance' 'Sci_Fi' 'Thriller' 'War' 'Western']
[[ 0  0  0  0  1  0  0  3  0  0  0  0  0  2  0  0  0  0]
 [ 0  0  0  0  1  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  2  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  1  1  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  1 19  0  0  4  0  0  0  0  0 13  1  0  0  0]
 [ 0  0  0  0  1  0  1  0  0  0  0  0  0  0  0  3  0  0]
 [ 0  0  0  0  1  0  8  7  0  0  0  0  0  2  0  0  0  0]
 [ 0  0  0  0  5  0  3 24  0  0  0  0  0  6  2  3  0  0]
 [ 1  0  0  1  2  0  0  2  6  0  3  0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  1  0  0  2  0  0  3  0  0  1  0  1  0  0]
 [ 0  0  0  0  0  0  3  1  1  0  0  1  0  4  0  0  0  0]
 [ 0  0  0  0  0  2  0  6  0  0  1  0  2  0  2  5  0  0]
 [ 0  0  0  0  8  0  0 13  0  0  1  0  0 24  2  3  0  0]
 [ 0  0  0  0  1  

  'precision', 'predicted', average, warn_for)


# Logistic Regression


In [6]:
def trainLR(train_data, train_labels):
    #Convert tags to a vector of word frequencies, concatenate with audio-visual data
    vectoriser = CountVectorizer()
    tag_data = pd.DataFrame(vectoriser.fit_transform(train_data['tag']).toarray())
    av_data = train_data.drop(columns=['movieId','title','YTId','year','tag'])
    data_transformed = pd.concat([tag_data,av_data],axis=1)
    
    #Train logistic regression model, with lasso regularisation. 
    modelLR = LogisticRegression(random_state=0, penalty='l1').fit(data_transformed, train_labels['genres'])
    return (modelLR, vectoriser)

In [7]:
def evaluateLR(model, valid_data, valid_labels):
    #Unpack model and vectoriser, transform data into useable format
    modelLR, vectoriser = model
    tag_data = pd.DataFrame(vectoriser.transform(valid_data['tag']).toarray())
    av_data = valid_data.drop(columns=['movieId','title','YTId','year','tag'])
    data_transformed = pd.concat([tag_data,av_data],axis=1)
    
    #Make predictions and calculate evaluation metrics
    y_pred = modelLR.predict(data_transformed)
    y_true = valid_labels['genres']
    acc = modelLR.score(data_transformed, y_true)
    labels = list(set(valid_labels['genres']))
    precision = pd.DataFrame(precision_score(y_true, y_pred, labels=labels, average=None))
    recall = pd.DataFrame(recall_score(y_true, y_pred, labels=labels, average=None))
    prec_recall = pd.concat([precision, recall], axis=1)
    prec_recall.columns = ['precision', 'recall']
    prec_recall.index = labels
    f1_score_weighted = f1_score(y_true, y_pred, average='weighted')
    
    #Print evaluation results.
    print(labels)
    print(confusion_matrix(y_true,y_pred,labels=labels))
    print(prec_recall)
    print('Weighted f-score: ', f1_score_weighted)
    print('Total accuracy: ', acc) 
    return

### Logistic Regression model with weightings to balance dataset

In [8]:
def trainLRBalanced(train_data, train_labels):
    #Convert tags to a vector of word frequencies and concatenate with AV data.
    vectoriser = CountVectorizer()
    tag_data = pd.DataFrame(vectoriser.fit_transform(train_data['tag']).toarray())
    av_data = train_data.drop(columns=['movieId','title','YTId','year','tag'])
    data_transformed = pd.concat([tag_data,av_data],axis=1)
    
    #Train model using balanced weightings.
    modelLR = LogisticRegression(random_state=0, class_weight='balanced', penalty='l1').fit(data_transformed, train_labels['genres'])
    return (modelLR, vectoriser)

### Evaluation that takes top two label predictions, and outputs true if correct label is one of them.

In [9]:
def evaluateMultiLR(model, valid_data, valid_labels):
    #Unpacks model and vectoriser, concatenates with AV data.
    modelLR, vectoriser = model
    tag_data = pd.DataFrame(vectoriser.transform(valid_data['tag']).toarray())
    av_data = valid_data.drop(columns=['movieId','title','YTId','year','tag'])
    data_transformed = pd.concat([tag_data,av_data],axis=1)
    
    #Makes two predictions for each instance
    y_prob = modelLR.predict_proba(data_transformed)
    y_pred = []
    for row in y_prob:
        row = list(enumerate(row))
        row = sorted(row, reverse=True, key=lambda x:x[1])
        y_pred.append((modelLR.classes_[row[0][0]], modelLR.classes_[row[1][0]]))
    
    #Calculates accuracy where it is correct if true label is one of the two predicted classes.
    count = 0
    correct = 0
    for i in range(len(y_pred)):
        count += 1
        if valid_labels['genres'][i] in y_pred[i]:
            correct += 1
    acc = correct / count
    print(acc)
    return

### Usage of logistic regression functions

In [10]:
#Training regular model LR1
modelLR = trainLR(train_data,train_labels)



In [14]:
#Evaluation of model LR1 on validation data
evaluateLR(modelLR, valid_data, valid_labels)

['Comedy', 'Children', 'Animation', 'Romance', 'Musical', 'Action', 'Horror', 'Crime', 'Film_Noir', 'Fantasy', 'Thriller', 'Western', 'Drama', 'Sci_Fi', 'Mystery', 'War', 'Documentary', 'Adventure']
[[22  0  0  7  1  0  0  0  0  2  1  0  4  0  1  0  0  0]
 [ 0  1  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  1  0  1  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 7  0  0 18  0  0  1  0  0  0  5  1 17  2  0  0  0  0]
 [ 2  0  0  2  1  0  0  0  0  2  0  0  1  0  0  0  2  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  4  0  0  0  1  0]
 [ 0  0  0  1  0  0  5  0  0  0  1  0  1  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  1  0  0  2  0  1  0  0  0  0  0]
 [ 1  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  3  1  1  4  0  0  6  0  0  3  0  0  0  0  0]
 [ 1  0  0  1  0  0  1  1  0  1 15  0  6  0  0  1  1  0]
 [ 2  0  1  1  1  0  0  0  0  0  0  0  1  0  0  0  1  0]
 [ 6  0  0  4  0  0  0  0  0  0  5  0 27  0  0  0  1  0]
 [ 0  1  0  2  0  0  0  0  0  0  1  0  2 10  0  0  0  0]
 [ 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [16]:
#Training model LR2 with weightings to balance dataset
modelLRBalanced = trainLRBalanced(train_data, train_labels)



In [17]:
#Evaluating model LR2
evaluateLR(modelLRBalanced, valid_data, valid_labels)

['Comedy', 'Children', 'Animation', 'Romance', 'Musical', 'Action', 'Horror', 'Crime', 'Film_Noir', 'Fantasy', 'Thriller', 'Western', 'Drama', 'Sci_Fi', 'Mystery', 'War', 'Documentary', 'Adventure']
[[16  1  1  3  1  3  1  1  1  2  0  2  3  0  1  0  1  1]
 [ 0  1  0  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  2  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 8  1  0 11  1  0  2  3  3  1  2  1 11  2  1  0  4  0]
 [ 1  1  0  2  2  0  0  0  0  1  0  0  0  0  0  0  2  1]
 [ 0  0  0  0  0  2  0  0  0  0  0  0  1  0  0  1  2  0]
 [ 0  0  0  0  0  0  5  0  0  0  1  1  1  0  0  0  0  0]
 [ 1  0  0  0  0  1  0  2  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  2  2  1  5  0  0  6  0  0  2  0  0  0  0  0]
 [ 2  0  0  0  0  1  2  3  0  1  8  0  3  0  4  2  2  0]
 [ 2  0  1  0  1  0  0  1  0  0  0  1  0  0  0  0  1  0]
 [ 4  0  1  1  0  1  0  2  1  0  5  3 15  0  0  1  9  0]
 [ 0  3  0  0  0  0  0  1  1  0  0  1  2  8  0  0  0  0]
 [ 

In [18]:
#Evaluating original LR1 model but using multi-label evaluation (LR3)
evaluateMultiLR(modelLR, valid_data, valid_labels)

0.6153846153846154


# Decision Tree

In [19]:
def trainDT(train_data, train_labels):
    #Transforms tag data into count vector, and concatenates with AV data.
    vectoriser = CountVectorizer()
    tag_data = pd.DataFrame(vectoriser.fit_transform(train_data['tag']).toarray())
    av_data = train_data.drop(columns=['movieId','title','YTId','year','tag'])
    data_transformed = pd.concat([tag_data,av_data],axis=1)
    
    #Trains model with decision tree, no restrictions on size
    modelDT = DecisionTreeClassifier(random_state=0, criterion='entropy')
    modelDT.fit(data_transformed, train_labels['genres'])
    
    return (modelDT, vectoriser)

In [20]:
def evaluateDT(model, valid_data, valid_labels):
    #Unpacks model and vectoriser, transforms data into useable format
    modelDT, vectoriser = model
    tag_data = pd.DataFrame(vectoriser.transform(valid_data['tag']).toarray())
    av_data = valid_data.drop(columns=['movieId','title','YTId','year','tag'])
    data_transformed = pd.concat([tag_data,av_data],axis=1)
    
    #Makes predictions and calculates evaluation metrics
    y_pred = modelDT.predict(data_transformed)
    y_true = valid_labels['genres']
    acc = modelDT.score(data_transformed, y_true)
    f1_score_weighted = f1_score(y_true, y_pred, average='weighted')
    labels = modelDT.classes_
    
    #Prints evaluation results
    print(labels)
    print(confusion_matrix(y_true,y_pred,labels=labels))
    print('Weighted f-score: ', f1_score_weighted)
    print('Total accuracy: ', acc) 
    return

### Decision tree with restricted depth

In [21]:
def trainReducedDT(train_data, train_labels):
    #Transforms tag data into count vector, and concatenates with AV data.
    vectoriser = CountVectorizer()
    tag_data = pd.DataFrame(vectoriser.fit_transform(train_data['tag']).toarray())
    av_data = train_data.drop(columns=['movieId','title','YTId','year','tag'])
    data_transformed = pd.concat([tag_data,av_data],axis=1)
    
    #Trains model with decision tree, depth restricted to 7.
    modelDT = DecisionTreeClassifier(random_state=0, max_depth=7)
    modelDT.fit(data_transformed, train_labels['genres'])
    
    return (modelDT, vectoriser)

### Usage of decision tree functions

In [22]:
#Training of unrestricted decision tree
modelDT = trainDT(train_data,train_labels)

In [23]:
#Evaluation of unrestricted decision tree on training data
evaluateDT(modelDT, train_data, train_labels)

['Action' 'Adventure' 'Animation' 'Children' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Fantasy' 'Film_Noir' 'Horror' 'Musical' 'Mystery'
 'Romance' 'Sci_Fi' 'Thriller' 'War' 'Western']
[[ 86   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0 104   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0  30   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 106   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0 583   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 237   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0 207   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0 713   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0 298   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  78   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0 244   0   0   0   0   0 

In [24]:
#Evaluation of unrestricted decision tree on validation data
evaluateDT(modelDT, valid_data, valid_labels)

['Action' 'Adventure' 'Animation' 'Children' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Fantasy' 'Film_Noir' 'Horror' 'Musical' 'Mystery'
 'Romance' 'Sci_Fi' 'Thriller' 'War' 'Western']
[[ 0  0  0  0  1  0  0  0  0  0  0  0  0  2  1  1  0  1]
 [ 0  0  0  1  0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  1  0  0  0  0  0  0  0  0  0  2  0  0  0  0  0]
 [ 1  0  0  0  1  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 3  1  1  0 11  0  2  6  1  0  1  0  2  7  1  2  0  0]
 [ 0  0  0  0  2  0  0  0  1  0  0  0  1  1  0  0  0  0]
 [ 0  1  0  1  2  1  4  5  0  0  0  1  1  0  0  1  1  0]
 [ 0  2  2  1  4  2  1  9  1  0  1  1  3  9  0  7  0  0]
 [ 0  0  0  0  0  0  0  3  4  0  2  0  0  4  4  1  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  3  1  0  0  0]
 [ 0  0  0  0  1  0  1  0  0  0  3  0  2  0  0  1  0  0]
 [ 0  0  0  0  1  2  2  0  0  0  1  1  0  1  2  0  0  0]
 [ 1  0  0  1  0  0  1  4  1  0  0  1  1  1  3  4  0  0]
 [ 1  3  0  0 11  2  0  6  3  0  1  1  1  8  5  6  3  0]
 [ 0  0  0  0  0  

  'precision', 'predicted', average, warn_for)


In [25]:
#Training of decision tree with restricted depth
modelDT_7 = trainReducedDT(train_data,train_labels)

In [26]:
#Evaluating restricted decision tree on training data
evaluateDT(modelDT_7, train_data, train_labels)

['Action' 'Adventure' 'Animation' 'Children' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Fantasy' 'Film_Noir' 'Horror' 'Musical' 'Mystery'
 'Romance' 'Sci_Fi' 'Thriller' 'War' 'Western']
[[  0   0   0   0  42   0   0   0   0   0   0   0   0   1   0  43   0   0]
 [  0   2   0   0  55   0   0   0   1   0   0   0   0   4   0  41   1   0]
 [  0   0   0   0  16   0   0   0   1   0   0   0   0   0   0  13   0   0]
 [  0   0   0   0  72   0   0   0   3   0   0   0   0   1   0  30   0   0]
 [  0   0   0   0 411   0   1   0   1   0   0   0   0   4   0 164   2   0]
 [  0   0   0   0 117   0   0   1   1   0   0   0   0   1   0 116   1   0]
 [  0   0   0   0  85   0  53   0   0   0   0   0   0   0   0  66   3   0]
 [  0   0   0   0 312   0   0   6   1   0   0   0   0   7   0 380   7   0]
 [  0   0   0   0  88   0   0   1  75   0   0   0   0   5   4 124   1   0]
 [  0   0   0   0  45   0   0   0   0   1   0   0   0   1   0  30   1   0]
 [  0   0   0   0  66   0   0   0   0   0   2   0   0   2   2 172 

  'precision', 'predicted', average, warn_for)


In [27]:
#Evaluating restricted decision tree on validation data
evaluateDT(modelDT_7, valid_data, valid_labels)

['Action' 'Adventure' 'Animation' 'Children' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Fantasy' 'Film_Noir' 'Horror' 'Musical' 'Mystery'
 'Romance' 'Sci_Fi' 'Thriller' 'War' 'Western']
[[ 0  0  0  0  5  0  0  0  0  0  0  0  0  0  0  1  0  0]
 [ 0  0  0  0  2  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  2  0  0]
 [ 0  0  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 26  0  0  0  0  0  0  0  0  1  0 11  0  0]
 [ 0  0  0  0  2  0  0  0  0  0  0  0  0  0  0  3  0  0]
 [ 0  0  0  0  9  0  3  0  0  0  0  0  0  0  0  6  0  0]
 [ 0  0  0  0 20  0  0  0  0  0  0  0  0  0  0 22  1  0]
 [ 0  0  0  0  3  0  0  0  4  0  0  0  0  2  1  8  0  0]
 [ 0  0  0  0  3  0  0  0  0  0  0  0  0  0  0  1  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  7  0  0]
 [ 0  0  0  0  4  0  2  0  0  0  0  0  0  1  1  2  0  0]
 [ 0  0  0  0  5  0  0  0  0  0  0  0  0  0  2 11  0  0]
 [ 0  0  0  0 20  0  0  1  0  0  0  0  1  6  0 23  0  0]
 [ 0  0  0  0  3  

  'precision', 'predicted', average, warn_for)
