# Importing the Required Dependencies

In [34]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import BernoulliNB 


# Preprocessing Training Data 

In [7]:
train_df = pd.read_csv('Datasets/train_data.txt', delimiter = ' ::: ', names=['Index','Title','Genre','Description'])
train_df.drop(columns='Index', inplace = True)
train_df

  train_df = pd.read_csv('Datasets/train_data.txt', delimiter = ' ::: ', names=['Index','Title','Genre','Description'])


Unnamed: 0,Title,Genre,Description
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...
...,...,...,...
54209,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on B...
54210,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The siste...
54211,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about gr..."
54212,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and have...


In [8]:
train_df.shape

(54214, 3)

In [9]:
train_df['Genre'].unique()

array(['drama', 'thriller', 'adult', 'documentary', 'comedy', 'crime',
       'reality-tv', 'horror', 'sport', 'animation', 'action', 'fantasy',
       'short', 'sci-fi', 'music', 'adventure', 'talk-show', 'western',
       'family', 'mystery', 'history', 'news', 'biography', 'romance',
       'game-show', 'musical', 'war'], dtype=object)

# Encoding the Categorical Data 

In [10]:
encoder = LabelEncoder()
train_df['Genre'] = encoder.fit_transform(train_df['Genre'])

In [11]:
train_df['Genre'].unique()

array([ 8, 24,  1,  7,  5,  6, 18, 13, 22,  3,  0, 10, 21, 20, 14,  2, 23,
       26,  9, 16, 12, 17,  4, 19, 11, 15, 25])

In [12]:
train_df['Text'] = train_df['Title'] +' ' + train_df['Description']
train_df = train_df.drop(columns=['Title','Description'])
train_df

Unnamed: 0,Genre,Text
0,8,Oscar et la dame rose (2009) Listening in to a...
1,24,Cupid (1997) A brother and sister with a past ...
2,1,"Young, Wild and Wonderful (1980) As the bus em..."
3,8,The Secret Sin (1915) To help their unemployed...
4,8,The Unrecovered (2007) The film's title refers...
...,...,...
54209,5,"""Bonino"" (1953) This short-lived NBC live sitc..."
54210,13,Dead Girls Don't Cry (????) The NEXT Generatio...
54211,7,Ronald Goedemondt: Ze bestaan echt (2008) Ze b...
54212,5,Make Your Own Bed (1944) Walter and Vivian liv...


# Vectorizing the Train Data

In [32]:
vectorizer = CountVectorizer(lowercase=True, stop_words='english', min_df=1)
X_train = vectorizer.fit_transform(train_df['Text'])

In [14]:
Y_train = train_df['Genre']

model = LogisticRegression()
model.fit(X_train,Y_train)

# Preprocessing Test Data

In [16]:
test_df = pd.read_csv('Datasets/test_data.txt', delimiter=' ::: ', engine='python' , names= ['Index','Title','Description'])
test_df.drop(columns='Index',inplace=True)
test_df

Unnamed: 0,Title,Description
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,Er nu zhai (1955),Before he was known internationally as a marti...
...,...,...
54195,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Dar..."
54196,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their ...
54197,Oliver Twink (2007),"A movie 169 years in the making. Oliver Twist,..."
54198,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard ..."


In [21]:
test_df = pd.read_csv('Datasets/test_data.txt', delimiter=' ::: ', engine='python' , names= ['Index','Title','Description'])
test_df.drop(columns='Index',inplace=True)
test_df

Unnamed: 0,Title,Description
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,Er nu zhai (1955),Before he was known internationally as a marti...
...,...,...
54195,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Dar..."
54196,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their ...
54197,Oliver Twink (2007),"A movie 169 years in the making. Oliver Twist,..."
54198,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard ..."


In [22]:
test_df['Text'] = test_df['Title'] +' ' + test_df['Description']
test_df.drop(columns= ['Title','Description'],inplace=True)

In [23]:
X_test = vectorizer.transform(test_df['Text'])
print(X_test)

  (0, 132178)	0.07723030965930751
  (0, 131692)	0.14276455403103375
  (0, 127937)	0.7244174398692158
  (0, 123002)	0.08633363464426817
  (0, 122108)	0.12296593552308774
  (0, 121803)	0.12216957138278162
  (0, 115011)	0.08714985478228343
  (0, 114902)	0.08772626621554351
  (0, 114768)	0.10351821863181784
  (0, 109217)	0.1759504656544637
  (0, 108120)	0.1759504656544637
  (0, 103110)	0.18085129255090931
  (0, 100618)	0.08235403854445922
  (0, 96648)	0.11379111874935145
  (0, 85292)	0.1280424420556733
  (0, 77924)	0.10618191428802373
  (0, 77717)	0.08710264726281644
  (0, 71462)	0.1318099354638689
  (0, 71058)	0.09304909639354327
  (0, 69621)	0.0868701564675276
  (0, 61221)	0.07522719107292432
  (0, 48413)	0.09769911376906405
  (0, 47620)	0.08396775259802947
  (0, 42518)	0.0926163400072346
  (0, 39974)	0.19422518401035885
  :	:
  (54199, 70258)	0.057432247345554396
  (54199, 69684)	0.08702404787472318
  (54199, 58382)	0.1412151675068971
  (54199, 57481)	0.14008917772316354
  (54199, 53889

In [24]:
X_predict = model.predict(X_test)

# Preprocessing Solution Data

In [26]:
test_sol_df = pd.read_csv('Datasets/test_data_solution.txt', delimiter=' ::: ', engine='python', names=['Index','Title','Genre','Description'])
test_sol_df.drop(columns = 'Index', inplace = True)

test_sol_df

Unnamed: 0,Title,Genre,Description
0,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,Er nu zhai (1955),drama,Before he was known internationally as a marti...
...,...,...,...
54195,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Dar..."
54196,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their ...
54197,Oliver Twink (2007),adult,"A movie 169 years in the making. Oliver Twist,..."
54198,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard ..."


In [27]:
test_sol_df['Genre'] = encoder.transform(test_sol_df['Genre'])

In [28]:
test_sol_df['Genre']

0        24
1         5
2         7
3         8
4         8
         ..
54195    13
54196    26
54197     1
54198     8
54199     8
Name: Genre, Length: 54200, dtype: int32

In [29]:
test_sol_df['Text'] = test_sol_df['Title'] + ' ' + test_sol_df['Description']
test_sol_df.drop(columns = ['Title','Description'], inplace = True)

test_sol_df

Unnamed: 0,Genre,Text
0,24,Edgar's Lunch (1998) L.R. Brane loves his life...
1,5,"La guerra de papá (1977) Spain, March 1964: Qu..."
2,7,Off the Beaten Track (2010) One year in the li...
3,8,"Meu Amigo Hindu (2015) His father has died, he..."
4,8,Er nu zhai (1955) Before he was known internat...
...,...,...
54195,13,"""Tales of Light & Dark"" (2013) Covering multip..."
54196,26,Der letzte Mohikaner (1965) As Alice and Cora ...
54197,1,Oliver Twink (2007) A movie 169 years in the m...
54198,8,"Slipstream (1973) Popular, but mysterious rock..."


In [30]:
Y_test = test_sol_df['Genre']

# Checking the Accuracy 

In [31]:
accuracy_score(Y_test,X_predict)

0.5887269372693726