In [380]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn import svm

from sklearn.multiclass import OneVsRestClassifier


In [381]:
df = pd.read_csv('train_data_merged_labels.csv')
df.head()
df['Core Relations'] = df['Core Relations'].replace({'movie.estimated_budget':'movie.estimated.budget', 
                                             'movie.directed_by':'movie.directed.by', 
                                             'movie.gross_revenue':'movie.gross.revenue'
                                             ,'movie.initial_release_date':'movie.initial.release.date'
                                             ,'movie.produced_by':'movie.produced.by'
                                             ,'movie.production_companies': 'movie.production.companies'})

df = df.drop_duplicates()
df.columns=['utterances','relations']
print(len(df))
df_new = df[df.relations != 'none']
print(len(df_new))
df_test = pd.read_csv('test_data.csv')
df.head()


2163
1861


Unnamed: 0,utterances,relations
0,who plays luke on star wars new hope,movie.starring.actor_movie.starring.character
1,show credits for the godfather,movie.starring.actor
2,who was the main actor in the exorcist,movie.starring.actor
3,who played dory on finding nemo,movie.starring.actor_movie.starring.character
4,who was the female lead in resident evil,actor.gender_movie.starring.actor


In [382]:
newTags = []
for i in df_new['relations']:
  newTags.append(i.split('_'))


df_new['newTags'] = newTags
df_new_2 = df_new[~(df_new['newTags'].str.len() == 0)]
df_new_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,utterances,relations,newTags
0,who plays luke on star wars new hope,movie.starring.actor_movie.starring.character,"[movie.starring.actor, movie.starring.character]"
1,show credits for the godfather,movie.starring.actor,[movie.starring.actor]
2,who was the main actor in the exorcist,movie.starring.actor,[movie.starring.actor]
3,who played dory on finding nemo,movie.starring.actor_movie.starring.character,"[movie.starring.actor, movie.starring.character]"
4,who was the female lead in resident evil,actor.gender_movie.starring.actor,"[actor.gender, movie.starring.actor]"


In [383]:
import re
def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
    
    return text

In [384]:
df_new_2['clean_utterances'] = df_new_2['utterances'].apply(lambda x: clean_text(x))
df_new_2.head()



Unnamed: 0,utterances,relations,newTags,clean_utterances
0,who plays luke on star wars new hope,movie.starring.actor_movie.starring.character,"[movie.starring.actor, movie.starring.character]",who plays luke on star wars new hope
1,show credits for the godfather,movie.starring.actor,[movie.starring.actor],show credits for the godfather
2,who was the main actor in the exorcist,movie.starring.actor,[movie.starring.actor],who was the main actor in the exorcist
3,who played dory on finding nemo,movie.starring.actor_movie.starring.character,"[movie.starring.actor, movie.starring.character]",who played dory on finding nemo
4,who was the female lead in resident evil,actor.gender_movie.starring.actor,"[actor.gender, movie.starring.actor]",who was the female lead in resident evil


In [385]:
df_test['clean_utterances'] = df_test['utterances'].apply(lambda x: clean_text(x))
df_test.head()

Unnamed: 0,utterances,clean_utterances
0,star of thor,star of thor
1,who is in the movie the campaign,who is in the movie the campaign
2,list the cast of the movie the campaign,list the cast of the movie the campaign
3,who was in twilight,who was in twilight
4,who is in vulguria,who is in vulguria


In [386]:
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
ps = PorterStemmer()
  
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    no_stopword_text = [ps.stem(w) for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

df_new_2['clean_utterances'] = df_new_2['clean_utterances'].apply(lambda x: remove_stopwords(x))
df_new_2.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,utterances,relations,newTags,clean_utterances
0,who plays luke on star wars new hope,movie.starring.actor_movie.starring.character,"[movie.starring.actor, movie.starring.character]",play luke star war new hope
1,show credits for the godfather,movie.starring.actor,[movie.starring.actor],show credit godfath
2,who was the main actor in the exorcist,movie.starring.actor,[movie.starring.actor],main actor exorcist
3,who played dory on finding nemo,movie.starring.actor_movie.starring.character,"[movie.starring.actor, movie.starring.character]",play dori find nemo
4,who was the female lead in resident evil,actor.gender_movie.starring.actor,"[actor.gender, movie.starring.actor]",femal lead resid evil


In [387]:
df_test['clean_utterances'] = df_test['clean_utterances'].apply(lambda x: remove_stopwords(x))
df_test.head()

Unnamed: 0,utterances,clean_utterances
0,star of thor,star thor
1,who is in the movie the campaign,movi campaign
2,list the cast of the movie the campaign,list cast movi campaign
3,who was in twilight,twilight
4,who is in vulguria,vulguria


In [388]:
xtrain1 = df_new_2['clean_utterances']
ytrain1 = df_new_2['newTags']

In [389]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
#multilabel_binarizer.fit(df_new_2['newTags'])

# transform target variable
y = multilabel_binarizer.fit_transform(df_new_2['newTags'])
print(y)
print(multilabel_binarizer.classes_)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['actor.gender' 'birth' 'budget' 'by' 'companies' 'date' 'gr.amount'
 'movie.country' 'movie.directed' 'movie.directed.by' 'movie.estimated'
 'movie.estimated.budget' 'movie.genre' 'movie.gross'
 'movie.gross.revenue' 'movie.initial' 'movie.initial.release.date'
 'movie.language' 'movie.locations' 'movie.music' 'movie.produced'
 'movie.produced.by' 'movie.production' 'movie.production.companies'
 'movie.rating' 'movie.starring.actor' 'movie.starring.character'
 'movie.subjects' 'of' 'person.date' 'release' 'revenue']


In [390]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word',max_df=0.8, max_features=1000)

In [391]:
xtrain, xval, ytrain, yval = train_test_split(df_new_2['clean_utterances'], y, test_size=0.2, random_state=0)

In [392]:
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

xtest_tfidf = tfidf_vectorizer.transform(df_test['clean_utterances'])

In [393]:
from sklearn.ensemble import RandomForestClassifier
import csv
# Performance metric
from sklearn.metrics import f1_score, accuracy_score

#clfrf=RandomForestClassifier(n_estimators=100)
clfrf=RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None,min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', 
                             max_leaf_nodes=None,bootstrap=True, oob_score=False, 
                             n_jobs=1, random_state=None, verbose=0, warm_start=False,class_weight=None)


clfrf.fit(xtrain_tfidf, ytrain)


# make predictions for validation set
y_pred = clfrf.predict(xval_tfidf)
y_pred1 = clfrf.predict(xtest_tfidf)
with open('sample_kaggle_submission_RandomForest.csv', 'w') as f:
  writer = csv.writer(f, delimiter = ',')
  for i in range(len(y_pred1)):
    prediction = '_'.join(multilabel_binarizer.inverse_transform(y_pred1)[i])
    if prediction == "movie.produced.by":
      prediction = "movie.produced_by"
    elif prediction == "movie.estimated.budget":
      prediction = "movie.estimated_budget"
    elif prediction == "movie.directed.by":
      prediction = "movie.directed_by"
    elif prediction == "movie.gross.revenue":
      prediction = "movie.gross_revenue"
    elif prediction == "movie.initial.release.date":
      prediction = "movie.initial_release_date"
    elif prediction == "movie.production.companies":
      prediction = "movie.production_companies"
    writer.writerow([i, prediction])

f1_score(yval, y_pred, average="micro")
#accuracy_score(y_pred,yval)

0.7718383311603649

In [394]:
from sklearn.tree import DecisionTreeClassifier

clfdt = DecisionTreeClassifier()

clfdt.fit(xtrain_tfidf, ytrain)

# make predictions for validation set
y_pred = clfdt.predict(xval_tfidf)
y_pred2 = clfdt.predict(xtest_tfidf)

with open('sample_kaggle_submission_DecisionTree.csv', 'w') as f:
  writer = csv.writer(f, delimiter = ',')
  for i in range(len(y_pred2)):

    prediction = '_'.join(multilabel_binarizer.inverse_transform(y_pred2)[i])
    if prediction == "movie.produced.by":
      prediction = "movie.produced_by"
    elif prediction == "movie.estimated.budget":
      prediction = "movie.estimated_budget"
    elif prediction == "movie.directed.by":
      prediction = "movie.directed_by"
    elif prediction == "movie.gross.revenue":
      prediction = "movie.gross_revenue"
    elif prediction == "movie.initial.release.date":
      prediction = "movie.initial_release_date"
    elif prediction == "movie.production.companies":
      prediction = "movie.production_companies"
    writer.writerow([i, prediction])

#f1_score(yval, y_pred, average="micro")
accuracy_score(y_pred,yval)

0.7184986595174263