# Emoji Prediction
## A very quick intro to the scikit-learn machine learning library

In [1]:
# standard library
import sys
import time
# numpy
import numpy as np
# scikit imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# scikit classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
import joblib

In [3]:
# Convert the tweets into a matrix of token counts. Iterating the file yields
# one line at a time, so each line of the file is treated as one document.
# The handle is only needed while fitting, so it is closed as soon as the
# vectorizer has consumed it.
count_vect = CountVectorizer()
with open("datasets/test_tweets.txt", encoding="utf8") as tweets:
    term_doc_matrix = count_vect.fit_transform(tweets)

# These two handles are deliberately left open: `emojis` (the labels) is read
# by a later cell, and `tweets_orig` is a second, unconsumed handle on the
# tweets file (not used in the cells visible here; presumably read further down).
emojis = open("datasets/test_labels.txt", encoding="utf8")
tweets_orig = open("datasets/test_tweets.txt", encoding="utf8")

# List the learned vocabulary in column order. `get_feature_names()` was
# removed in scikit-learn 1.2; ordering the fitted `vocabulary_` mapping
# (term -> column index) by index gives the same list on every version.
feature_names = sorted(count_vect.vocabulary_, key=count_vect.vocabulary_.get)
print(feature_names)

['abwyman', 'and', 'at', 'baby', 'be', 'bear', 'bingo', 'bj', 'blacchyna', 'black', 'blueeyes', 'brewhouse', 'bubby', 'cafe', 'california', 'card', 'catching', 'cathedral', 'christmas2016', 'covina', 'done', 'drag', 'ellis', 'explore', 'festive', 'fun', 'got', 'hell', 'her', 'history', 'island', 'just', 'la', 'light', 'like', 'little', 'lol', 'lovely', 'lupusgirl', 'makeup', 'modeling', 'modelingagency', 'mufffffaaaaaka', 'my', 'of', 'office', 'out', 'photography', 'preparatory', 'queens', 'race', 'redrock', 'restaurant', 'rupaul', 'rupaulsdragrace', 'school', 'sexy', 'sister', 'smiling', 'so', 'soul', 'step', 'the', 'things', 'thru', 'trap', 'university', 'up', 'user', 'victoria', 'west', 'with', 'yugioh']


In [4]:
# Expand the sparse count matrix into a dense array so the raw counts can be
# inspected (one row per document, one column per vocabulary term).
dense_counts = term_doc_matrix.toarray()
print(dense_counts)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
  0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
  0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0]
 [0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0]
 [1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0]
 [0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 2 0
  0 0 0 

In [5]:
# Transform the count matrix into a normalized tf-idf representation
# (TfidfTransformer defaults). The transformer is fitted first and then
# applied to the same matrix.
tfidf_transformer = TfidfTransformer().fit(term_doc_matrix)
normalized_tdm = tfidf_transformer.transform(term_doc_matrix)

print(normalized_tdm.toarray())


[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5        0.         0.         0.
  0.         0.5        0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.5        0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.5        0.
  0.        ]
 [0.         0.         0.33333333 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.33333333 0.         0.         0.         0.         0.
  0.33333333 0.         0.

In [6]:
# Put the emoji labels into a list, one label per line of the labels file.
# Rewind first so that re-running this cell re-reads the whole file instead of
# getting an empty read from an already-consumed handle.
emojis.seek(0)
# splitlines() (unlike split('\n')) does not produce a trailing '' entry when
# the file ends with a newline, which would otherwise add a bogus empty label.
target = emojis.read().splitlines()
print(target)

['2', '17', '0', '18', '1', '9', '2', '0', '8', '13']


In [7]:
# Train a multi-layer perceptron on the tf-idf features, using the emoji
# labels as targets. MLPClassifier initialises its weights randomly, so a
# fixed random_state is passed to make a fresh "Restart & Run All" reproduce
# the same model.
classifier = MLPClassifier(random_state=42).fit(normalized_tdm, target)

