In [3]:
import pandas as pd
from itertools import groupby

## Data -> Table

In [4]:
descriptions =[]

with open("how_people_describe_themselves.txt","r")  as f:
    for i in f:
        descriptions.append(i.rstrip('\n'))

In [5]:
desc_array = [list(group) for k, group in groupby(descriptions, lambda x: x == ' ') if not k]
    

In [6]:
for i in desc_array:
    i[1] = i[1].strip('Who you are: ')
    i[2] = i[2].strip('What you are like: ')
    i[3] = i[3].strip('What is the essence of what makes you YOU: ')

In [7]:
df = pd.DataFrame(desc_array, columns=['person', 'who', 'likes', 'essence'])
df

Unnamed: 0,person,who,likes,essence
0,Person 1,I am a 28-year-old woman living in Massachusetts.,I'm neurotic and eccentric but good at heart.,I am obsessed with rabbits and love technology.
1,Person 2,I am dedicated and persistent and willing to g...,'m fun loving and like to make people laugh by...,I am insightful and am able to fit together pi...
2,Person 3,"I am a feminist, a student, and a homosexual.","I am kind, nurturing, caring, and genuine.","My honesty, my loyalty, and my generosity make..."
3,Person 4,I'm an immigrant who became a citizen of the U...,"I can be stubborn and stand-offish, but I stil...","I'm introverted, so I need time alone to recha..."
4,Person 5,I'm a boring person,I'm quiet and introspective.,I have deep wells of sorrow.
5,Person 6,I am a girl who lives in New York City in my t...,I am very social and outgoing.,I am outgoing and care tremendously about my f...
6,Person 7,I am a mom of 3,"I like R &B Music, Japanese Food, Pizz",I am sympathetic.
7,Person 8,I am a licensed counselor.,I'm silly and funny.,I'm fun and sarcastic a lot of the time.
8,Person 9,"I'm Jim, a 53 year old american man","I'm forceful, happy, calm and occasionally gru...",I'm a drivan dominant at work and play. My wif...
9,Person 10,"I'm a fat, biracial bisexual woman.",I'm a funny depressive--aren't all depressives...,"I'm a boring drudge, not glamorous or fancy, b..."


## NLP

In [14]:
import nltk
from nltk.corpus import stopwords
from spellchecker import SpellChecker

nltk.download('stopwords')
spell = SpellChecker()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akrishna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
def preprocess(text):
    """Pre-processes the text, splits into tokens that are lower-cased, filtered and lemmatized."""

    # lowering
    # Retain alphabetic words: alpha_only
    # Remove all stop words: no_stops
    tokens = (t.lower() for t in nltk.word_tokenize(text)
                            if t.isalpha()
                            and t.lower() not in stopwords.words())

    # Lemmatize all tokens into a new list: lemmatized
    #wordnet_lemmatizer = nltk.WordNetLemmatizer()
    return [spell.correction(t) for t in tokens]

In [16]:
pos_tagged_desc = []
for i in desc_array:
    x = [nltk.pos_tag(preprocess(i[1])), nltk.pos_tag(preprocess(i[2])), nltk.pos_tag(preprocess(i[3]))]
    pos_tagged_desc.append(x)

In [50]:
pos_tagged_desc

[[[('woman', 'NN'), ('living', 'VBG'), ('massachusetts', 'NNS')],
  [('neurotic', 'JJ'), ('eccentric', 'RB'), ('good', 'JJ'), ('heart', 'NN')],
  [('obsessed', 'VBN'),
   ('rabbits', 'NNS'),
   ('love', 'VBP'),
   ('technology', 'NN')]],
 [[('dedicated', 'VBN'),
   ('persistent', 'JJ'),
   ('willing', 'JJ'),
   ('go', 'VB'),
   ('extra', 'JJ'),
   ('mil', 'NN')],
  [('fun', 'NN'),
   ('loving', 'VBG'),
   ('like', 'IN'),
   ('make', 'NN'),
   ('people', 'NNS'),
   ('laugh', 'IN'),
   ('telling', 'VBG'),
   ('jokes', 'NNS'),
   ('making', 'VBG'),
   ('light', 'JJ'),
   ('situations', 'NNS')],
  [('insightful', 'JJ'),
   ('able', 'JJ'),
   ('fit', 'NN'),
   ('together', 'RB'),
   ('pieces', 'NNS'),
   ('understand', 'VBP'),
   ('big', 'JJ'),
   ('picture', 'NN')]],
 [[('feminist', 'JJ'), ('student', 'NN'), ('homosexual', 'JJ')],
  [('kind', 'NN'), ('nurturing', 'VBG'), ('caring', 'VBG'), ('genuine', 'NN')],
  [('honesty', 'NN'),
   ('loyalty', 'NN'),
   ('generosity', 'NN'),
   ('make', 

In [18]:
descriptors = []
for i in pos_tagged_desc:
    s = ""
    for j in i:
        for k in j:
            s += k[0]+" "
    descriptors.append(s)

In [19]:
descriptors

['woman living massachusetts neurotic eccentric good heart obsessed rabbits love technology ',
 'dedicated persistent willing go extra mil fun loving like make people laugh telling jokes making light situations insightful able fit together pieces understand big picture ',
 'feminist student homosexual kind nurturing caring genuine honesty loyalty generosity make ',
 'immigrant became citizen united states american stubborn still much friends family nerd love video games prefer stay home introverted need time alone recharge calibrate ',
 'boring person quiet introspective deep wells sorrow ',
 'girl lives new york city twenties incredibly close family social outgoing outgoing tremendously friends ',
 'mom like r b music japanese food pizza sympathetic ',
 'licensed counsellor silly funny fun sarcastic lot time ',
 'year old american forceful happy calm occasionally grumpy touch sarcasm driven dominant work play wife essential life lost without grounding ',
 'fat racial bisexual woman fu

## TF-IDF

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### For each description as a document

In [93]:
vectorizer = TfidfVectorizer(use_idf=True)
X = vectorizer.fit_transform(descriptors)

df_idf = pd.DataFrame(X[2].T.todense(), index=vectorizer.get_feature_names(),columns=["idf_weights"]).sort_values('idf_weights', ascending=False)[:20]
df_idf.sort_values(by=['idf_weights'], ascending=False)[:10]

Unnamed: 0,idf_weights
feminist,0.337769
generosity,0.337769
homosexual,0.337769
loyalty,0.337769
genuine,0.337769
nurturing,0.305459
honesty,0.305459
student,0.282534
make,0.250224
kind,0.23794


#### For each of the questions as a document

In [60]:
who = []
likes = []
essence = []
for i in range(len(pos_tagged_desc)):
        who.append(pos_tagged_desc[i][0])
        likes.append(pos_tagged_desc[i][1])
        essence.append(pos_tagged_desc[i][2])

In [66]:
who_list = []
for i in who:
    s = ""
    for j in i:
            s += j[0]+" "
    who_list.append(s)
    
likes_list = []
for i in likes:
    s = ""
    for j in i:
            s += j[0]+" "
    likes_list.append(s)
    
essence_list = []
for i in essence:
    s = ""
    for j in i:
            s += j[0]+" "
    essence_list.append(s)

In [90]:
vectorizer = TfidfVectorizer(use_idf=True)
X = vectorizer.fit_transform(who_list)

df_idf = pd.DataFrame(X[10].T.todense(), index=vectorizer.get_feature_names(),columns=["idf_weights"]).sort_values('idf_weights', ascending=False)[:20]
df_idf.sort_values(by=['idf_weights'], ascending=False)[:5]

Unnamed: 0,idf_weights
adventurous,0.57735
skydiver,0.57735
geek,0.57735
persistent,0.0
new,0.0


In [91]:
vectorizer = TfidfVectorizer(use_idf=True)
X = vectorizer.fit_transform(likes_list)

df_idf = pd.DataFrame(X[10].T.todense(), index=vectorizer.get_feature_names(),columns=["idf_weights"]).sort_values('idf_weights', ascending=False)[:20]
df_idf.sort_values(by=['idf_weights'], ascending=False)[:5]

Unnamed: 0,idf_weights
life,0.7889
love,0.614521
pretty,0.0
prefer,0.0
play,0.0


In [92]:
vectorizer = TfidfVectorizer(use_idf=True)
X = vectorizer.fit_transform(essence_list)

df_idf = pd.DataFrame(X[10].T.todense(), index=vectorizer.get_feature_names(),columns=["idf_weights"]).sort_values('idf_weights', ascending=False)[:20]
df_idf.sort_values(by=['idf_weights'], ascending=False)[:5]

Unnamed: 0,idf_weights
intelligent,0.78704
caring,0.616902
play,0.0
place,0.0
pieces,0.0


## RAKE

In [75]:
!pip install rake-nltk

Collecting rake-nltk
  Downloading https://files.pythonhosted.org/packages/8e/c4/b4ff57e541ac5624ad4b20b89c2bafd4e98f29fd83139f3a81858bdb3815/rake_nltk-1.0.4.tar.gz
Building wheels for collected packages: rake-nltk
  Building wheel for rake-nltk (setup.py) ... [?25ldone
[?25h  Created wheel for rake-nltk: filename=rake_nltk-1.0.4-py2.py3-none-any.whl size=7818 sha256=2dbcca8676767f9cb99586ab81d4f5cad8db229c6ad7bfd3677638749c1c9f3e
  Stored in directory: /Users/akrishna/Library/Caches/pip/wheels/ef/92/fc/271b3709e71a96ffe934b27818946b795ac6b9b8ff8682483f
Successfully built rake-nltk
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.4


In [89]:
from rake_nltk import Rake, Metric

r = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)
r.extract_keywords_from_text(desc_array[0][1])
r.get_ranked_phrases()

['old woman living', 'year', 'massachusetts', '28']

## Top-3

In [94]:
top3 = []
for i in X:
    top3_desc = pd.DataFrame(i.T.todense(), index=vectorizer.get_feature_names(),columns=["idf_weights"]).sort_values('idf_weights', ascending=False)
    top3.append(top3_desc.index[:3].tolist())

In [95]:
top3

[['technology', 'obsessed', 'massachusetts'],
 ['jokes', 'mil', 'dedicated'],
 ['feminist', 'generosity', 'homosexual'],
 ['home', 'calibrate', 'still'],
 ['sorrow', 'deep', 'wells'],
 ['outgoing', 'tremendously', 'incredibly'],
 ['japanese', 'pizza', 'sympathetic'],
 ['silly', 'counsellor', 'licensed'],
 ['touch', 'grounding', 'forceful'],
 ['bisexual', 'fancy', 'glamorous'],
 ['geek', 'skydiver', 'adventurous'],
 ['person', 'see', 'today'],
 ['know', 'allows', 'appreciate'],
 ['urban', 'passion', 'cities'],
 ['musician', 'laidback', 'tick'],
 ['aged', 'point', 'middle'],
 ['kind', 'loving', 'male'],
 ['cry', 'shoulder', 'listening'],
 ['important', 'adult', 'creative'],
 ['year', 'old', 'ability'],
 ['time', 'employed', 'agreeable'],
 ['energy', 'courteous', 'female'],
 ['hand', 'lend', 'give'],
 ['try', 'make', 'better'],
 ['pretty', 'husband', 'boring'],
 ['mal', 'everything', 'think'],
 ['desire', 'thing', 'inquisitive'],
 ['donald', 'manager', 'judge'],
 ['things', 'rather', 'ser

## NounProject

In [126]:
search_terms = top3[10]

In [127]:
search_terms

['geek', 'skydiver', 'adventurous']

In [150]:
import requests, json
from requests_oauthlib import OAuth1
from PIL import Image
import urllib.request

In [148]:
auth = OAuth1("1aac7c276b39401f9f042c53e8f8e5d6", "38b8f9bf4a074812918528889a076fa1")

for search_term in search_terms:
    endpoint = "http://api.thenounproject.com/icon/" + search_term
    response = requests.get(endpoint, auth=auth)

    parsed_response = json.loads(response.content.decode('utf8').replace("'", '"'))
    
    print(parsed_response)
    icon_url = parsed_response['icon']['preview_url']
    Image(url = icon_url, height=100, width=100)
    urllib.request.urlretrieve(icon_url, search_term + ".png")

{'icon': {'attribution': 'geek by iconsmind.com from Noun Project', 'attribution_preview_url': 'https://static.thenounproject.com/attribution/69709-600.png', 'collections': [{'author': {'location': 'London, GB', 'name': 'iconsmind.com', 'permalink': '/imicons', 'username': 'imicons'}, 'author_id': '438738', 'date_created': '2014-09-02 05:11:04', 'date_updated': '2014-09-02 05:23:49', 'description': '', 'id': '1009', 'is_collaborative': '', 'is_featured': '0', 'is_published': '1', 'is_store_item': '0', 'name': 'People', 'permalink': '/imicons/collection/people', 'slug': 'people', 'sponsor': {}, 'sponsor_campaign_link': '', 'sponsor_id': '', 'tags': ['line-icon', 'shape', 'illustration ', 'design', 'signs', 'people', 'abstract', 'person', 'figure'], 'template': '24'}], 'date_uploaded': '2014-09-02', 'id': '69709', 'is_active': '1', 'is_explicit': '0', 'license_description': 'creative-commons-attribution', 'nounji_free': '0', 'permalink': '/term/geek/69709', 'preview_url': 'https://static

In [162]:
def png2jpg(image_path):
    image = Image.open(image_path)
    new_image = Image.new("RGBA", image.size, "WHITE")
    new_image.paste(image, (0, 0), image)
    new_image.convert('RGB').save(image_path.split('.')[0] + '.jpg', "JPEG") 

In [163]:
for search_term in search_terms:
    png2jpg(search_term + '.png')

In [164]:
pngFilenames = [search_term + '.png' for search_term in search_terms]

## Layout

In [154]:
import math
from PIL import Image

def arrangeImagesInCircle(masterImage, imagesToArrange, radius):
    imgWidth, imgHeight = masterImage.size

#     diameter = min(
#         imgWidth  - max(img.size[0] for img in imagesToArrange),
#         imgHeight - max(img.size[1] for img in imagesToArrange)
#     )
#     radius = diameter / 2

    circleCenterX = imgWidth  / 2
    circleCenterY = imgHeight / 2
    theta = 2*math.pi / len(imagesToArrange)
    for i, curImg in enumerate(imagesToArrange):
        angle = i * theta
        dx = int(radius * math.cos(angle))
        dy = int(radius * math.sin(angle))

        pos = (
            int(circleCenterX + dx - curImg.size[0]/2),
            int(circleCenterY + dy - curImg.size[1]/2)
        )
        masterImage.paste(curImg, pos, mask=curImg)

In [167]:
img = Image.new("RGB", (600,600), "WHITE")

images = [Image.open(filename) for filename in pngFilenames]
arrangeImagesInCircle(img, images, 100)

img.convert('RGB').save("output.jpg", "JPEG")

## Detect Mood

In [174]:
!pip install textblob

Collecting textblob
[?25l  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
[K     |████████████████████████████████| 645kB 1.8MB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.15.3


In [177]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from textblob import Word
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv('text_emotion.csv')

data = data.drop('author', axis=1)

# Making all letters lowercase
data['content'] = data['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# Removing Punctuation, Symbols
data['content'] = data['content'].str.replace('[^\w\s]',' ')

# Removing Stop Words using NLTK
stop = stopwords.words('english')
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

#Lemmatisation
data['content'] = data['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
#Correcting Letter Repetitions

def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

data['content'] = data['content'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))

# Code to find the top 10,000 rarest words appearing in the data
freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]

# Removing all those rarely appearing words from the data
freq = list(freq.index)
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

#Encoding output labels 'sadness' as '1' & 'happiness' as '0'
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.sentiment.values)
print(y)

# Splitting into training and testing data in 90:10 ratio
X_train, X_val, y_train, y_val = train_test_split(data.content.values, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)

# Extracting TF-IDF parameters
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.fit_transform(X_val)

# Extracting Count Vectors Parameters
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(data['content'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)

[ 2 10 10 ...  7  5  7]


In [179]:
lbl_enc.classes_

array(['anger', 'boredom', 'empty', 'enthusiasm', 'fun', 'happiness',
       'hate', 'love', 'neutral', 'relief', 'sadness', 'surprise',
       'worry'], dtype=object)

In [180]:
# # Model 1: Multinomial Naive Bayes Classifier
# nb = MultinomialNB()
# nb.fit(X_train_tfidf, y_train)
# y_pred = nb.predict(X_val_tfidf)
# print('naive bayes tfidf accuracy %s' % accuracy_score(y_pred, y_val))
# # naive bayes tfidf accuracy 0.5289017341040463

# # Model 2: Linear SVM
# lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
# lsvm.fit(X_train_tfidf, y_train)
# y_pred = lsvm.predict(X_val_tfidf)
# print('svm using tfidf accuracy %s' % accuracy_score(y_pred, y_val))
# # svm tfidf accuracy 0.5404624277456648

# # Model 3: logistic regression
# logreg = LogisticRegression(C=1)
# logreg.fit(X_train_tfidf, y_train)
# y_pred = logreg.predict(X_val_tfidf)
# print('log reg tfidf accuracy %s' % accuracy_score(y_pred, y_val))
# # log reg tfidf accuracy 0.5443159922928709

# # Model 4: Random Forest Classifier
# rf = RandomForestClassifier(n_estimators=500)
# rf.fit(X_train_tfidf, y_train)
# y_pred = rf.predict(X_val_tfidf)
# print('random forest tfidf accuracy %s' % accuracy_score(y_pred, y_val))
# # random forest tfidf accuracy 0.5385356454720617

# ## Building models using count vectors feature
# # Model 1: Multinomial Naive Bayes Classifier
# nb = MultinomialNB()
# nb.fit(X_train_count, y_train)
# y_pred = nb.predict(X_val_count)
# print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_val))
# # naive bayes count vectors accuracy 0.7764932562620424

# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))
# lsvm using count vectors accuracy 0.7928709055876686

# Model 3: Logistic Regression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val))
# log reg count vectors accuracy 0.7851637764932563

# # Model 4: Random Forest Classifier
# rf = RandomForestClassifier(n_estimators=500)
# rf.fit(X_train_count, y_train)
# y_pred = rf.predict(X_val_count)
# print('random forest with count vectors accuracy %s' % accuracy_score(y_pred, y_val))
# # random forest with count vectors accuracy 0.7524084778420038

naive bayes tfidf accuracy 0.2325
svm using tfidf accuracy 0.209




log reg tfidf accuracy 0.23775
random forest tfidf accuracy 0.22175
naive bayes count vectors accuracy 0.32775
lsvm using count vectors accuracy 0.3315




log reg count vectors accuracy 0.34925
random forest with count vectors accuracy 0.321


In [181]:
#Below are 8 random statements. The first 4 depict happiness. The last 4 depict sadness

tweets = pd.DataFrame(['I am very happy today! The atmosphere looks cheerful',
'Things are looking great. It was such a good day',
'Success is right around the corner. Lets celebrate this victory',
'Everything is more beautiful when you experience them with a smile!',
'Now this is my worst, okay? But I am gonna get better.',
'I am tired, boss. Tired of being on the road, lonely as a sparrow in the rain. I am tired of all the pain I feel',
'This is quite depressing. I am filled with sorrow',
'His death broke my heart. It was a sad day'])

# Doing some preprocessing on these tweets as done before
tweets[0] = tweets[0].str.replace('[^\w\s]',' ')
from nltk.corpus import stopwords
stop = stopwords.words('english')
tweets[0] = tweets[0].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
from textblob import Word
tweets[0] = tweets[0].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# Extracting Count Vectors feature from our tweets
tweet_count = count_vect.transform(tweets[0])

#Predicting the emotion of the tweet using our already trained linear SVM
tweet_pred = lsvm.predict(tweet_count)
print(tweet_pred)
## result
## [0 0 0 0 1 1 1 1]

[ 5  5  5  7 12 12 10 10]


## Style Transfer