In [2]:
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords, reuters
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
# English stop-word list obtained from the NLTK `stopwords` corpus reader imported above.
stop_words = stopwords.words("english")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pickle

In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the movie CSV into the DataFrame `data`, which every later cell works on.
data = pd.read_csv("/case/wiki_movie.csv")

In [4]:
# Preview the first ten rows of the dataset.
data.head(n=10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
5,1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderl...,"Alice follows a large white rabbit down a ""Rab..."
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
8,1905,The Little Train Robbery,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Little_Train...,The opening scene shows the interior of the ro...
9,1905,The Night Before Christmas,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Night_Before...,Scenes are introduced using lines of the poem....


In [5]:
# Class balance of the label column: number of films per origin.
data["Origin/Ethnicity"].value_counts()

American        17377
British          3670
Bollywood        2931
Tamil            2599
Telugu           1311
Japanese         1188
Malayalam        1095
Hong Kong         791
Canadian          723
Australian        576
South_Korean      522
Chinese           463
Kannada           444
Bengali           306
Russian           232
Marathi           141
Filipino          128
Bangladeshi        87
Punjabi            84
Turkish            70
Malaysian          70
Egyptian           67
Assamese            9
Maldivian           2
Name: Origin/Ethnicity, dtype: int64

In [6]:
# Show the full plot text of the first film. Selecting the column by name
# keeps the cell correct if columns are ever added or reordered.
data["Plot"].iloc[0]

"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"

### Removing Stop words

In [7]:
# Fetch NLTK's English stop-word list and preview its first ten entries.
# NOTE: this rebinds `stopwords`, so from here on the name refers to a plain
# list rather than the corpus reader imported at the top of the notebook.
stopwords = nltk.corpus.stopwords.words('english')
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [None]:
def remove_swords(txt_data, stop_words=None):
    """Return the tokens of ``txt_data`` that are not stop words.

    Parameters
    ----------
    txt_data : str, iterable of str, or None
        Raw text (split on whitespace) or an already tokenised sequence.
        ``None`` is treated as empty text.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords`` list
        is used.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and casing. Matching
        against the stop words is case-insensitive.
    """
    if txt_data is None:
        return []
    if stop_words is None:
        # Fall back to the notebook-level list of English stop words.
        stop_words = stopwords
    # A set gives constant-time membership tests for every token.
    blocked = {word.lower() for word in stop_words}
    # Iterating a str yields single characters, so raw text is split into
    # whitespace-delimited words first.
    tokens = txt_data.split() if isinstance(txt_data, str) else txt_data
    return [word for word in tokens if word.lower() not in blocked]

# Add a column holding each plot with stop words filtered out. The helper
# defined in this cell is named `remove_swords`; it is passed directly to
# apply(), so no lambda wrapper is needed.
data['plot_no_sw'] = data['Plot'].apply(remove_swords)
data.head()

# Vector Space Model

In [8]:
# Count vectorizer: raw term counts over a 20-word vocabulary, ignoring any
# word that occurs in more than 30% of the plots.
vectorizer = CountVectorizer(
    analyzer="word",
    max_features=20,
    max_df=0.3,
)
# fit_transform() learns the vocabulary and vectorises the plots in one pass.
X = vectorizer.fit_transform(data["Plot"])
# fit() returns the estimator itself, so the fitted model is the same object.
count_model = vectorizer

In [9]:
# List the learned vocabulary. get_feature_names() was removed in
# scikit-learn 1.2, so use its replacement when available and fall back to
# the old accessor on older installations.
(
    count_model.get_feature_names_out().tolist()
    if hasattr(count_model, "get_feature_names_out")
    else count_model.get_feature_names()
)

['being',
 'can',
 'day',
 'family',
 'father',
 'find',
 'finds',
 'goes',
 'had',
 'home',
 'house',
 'however',
 'man',
 'mother',
 'new',
 'off',
 'police',
 'so',
 'tells',
 'wife']

In [10]:
# Slice the sparse matrix first so only the 20 displayed rows are densified,
# not the whole document-term matrix.
X[:20].todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 2, 0, 9, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0,

In [11]:
# TF-IDF vectorizer with the same settings as the count vectorizer: a 20-word
# vocabulary, skipping words found in more than 30% of the plots.
vectorizer = TfidfVectorizer(
    analyzer="word",
    max_features=20,
    max_df=0.3,
)
# One pass learns the vocabulary and IDF weights and produces the weighted matrix.
X = vectorizer.fit_transform(data["Plot"])
# fit() returns the estimator itself, so the fitted model is the same object.
tfidf_model = vectorizer

In [12]:
# List the learned vocabulary. get_feature_names() was removed in
# scikit-learn 1.2, so use its replacement when available and fall back to
# the old accessor on older installations.
(
    tfidf_model.get_feature_names_out().tolist()
    if hasattr(tfidf_model, "get_feature_names_out")
    else tfidf_model.get_feature_names()
)

['being',
 'can',
 'day',
 'family',
 'father',
 'find',
 'finds',
 'goes',
 'had',
 'home',
 'house',
 'however',
 'man',
 'mother',
 'new',
 'off',
 'police',
 'so',
 'tells',
 'wife']

In [13]:
# Slice the sparse matrix first so only the 5 displayed rows are densified,
# not the whole document-term matrix.
X[:5].todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 0.        

Full Model

In [14]:
# Fit TF-IDF features (1000-term vocabulary) on every plot and persist the
# fitted vectorizer so it can be reused for inference.
# NOTE(review): the vectorizer is fitted before the train/test split, so the
# held-out plots influence the vocabulary and IDF weights. Consider fitting on
# the training split only.
vectorizer = TfidfVectorizer(analyzer = "word", max_features = 1000)
tfidf_model = vectorizer.fit(data["Plot"])
# Context managers close the pickle files even if dump() raises.
with open("/case/tfidf.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf_model, tfidf_file)
X = tfidf_model.transform(data["Plot"])

# Hold out 10% of the films. The fixed seed makes the split, and therefore
# the reported metrics, reproducible across runs.
X_train,X_test,y_train,y_test = train_test_split(
    X, data["Origin/Ethnicity"], test_size = 0.1, random_state = 42
)

# One-vs-rest logistic regression predicting Origin/Ethnicity from the TF-IDF vector.
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)
with open("/case/text_clf.pkl", 'wb') as clf_file:
    pickle.dump(clf, clf_file)

# Evaluate on the held-out split.
preds = clf.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))



              precision    recall  f1-score   support

    American       0.67      0.96      0.79      1752
  Australian       0.50      0.03      0.05        40
 Bangladeshi       0.00      0.00      0.00        10
     Bengali       0.00      0.00      0.00        26
   Bollywood       0.56      0.56      0.56       315
     British       0.53      0.24      0.33       372
    Canadian       0.00      0.00      0.00        79
     Chinese       0.00      0.00      0.00        43
    Egyptian       0.00      0.00      0.00         4
    Filipino       0.00      0.00      0.00        15
   Hong Kong       0.71      0.16      0.26        74
    Japanese       0.68      0.20      0.31       129
     Kannada       0.00      0.00      0.00        43
   Malayalam       0.39      0.16      0.23       100
   Malaysian       0.00      0.00      0.00         4
     Marathi       0.00      0.00      0.00        18
     Punjabi       0.00      0.00      0.00         9
     Russian       0.00    

  'precision', 'predicted', average, warn_for)


In [None]:


# Reload the persisted vectorizer and classifier from the absolute paths the
# training cell wrote them to.
# SECURITY: pickle.load can execute arbitrary code; only load files that this
# notebook produced itself.
with open("/case/tfidf.pkl", 'rb') as tfidf_file:
    tf_model = pickle.load(tfidf_file)
with open("/case/text_clf.pkl", 'rb') as clf_file:
    loaded_clf = pickle.load(clf_file)

new_data = [""]  # empty-string placeholder; replace with the plot text(s) to score
# Vectorise with the loaded model so the cell does not depend on whatever
# `tfidf_model` happens to be left in the kernel.
X = tf_model.transform(new_data)
preds = loaded_clf.predict_proba(X)
print(loaded_clf.classes_)
print(preds)

