# NLP
Author Brian Tam, 10/16/2020

3. Prepping the words for NLP by:
    - Tokenizing with sklearn and spaCy
    - Lemmatisation
    - Count vectorizing words
    - Topic modeling

In [1]:
# Get pandas and postgres to work together
import psycopg2 as pg
import pandas as pd
import numpy as np
import pickle

# Import spacy to do NLP
import spacy

# Split the data into training and test sets
from sklearn.model_selection import train_test_split

# Import sklearn to do CountVectorizing
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

import matplotlib.pyplot as plt

# Topic Modeling
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

# Text Preprocessing
import re
import string

In [2]:
# Postgres info to connect
connection_args = {
    'host': 'localhost',  # We are connecting to our _local_ version of psql
    'dbname': 'myers_briggs',    # DB that we are connecting to
}

connection = pg.connect(**connection_args)  # What is that "**" there??

query = "SELECT * FROM cleaned_posts;"

df = pd.read_sql(query, connection)
df

Unnamed: 0,type,I-E,N-S,T-F,J-P,post,post_no_links
0,INFJ,0,1,1,0,"""['http://www.youtube.com/watch?v=qsXHcwe3krw'...",What has been the most life-changing experienc...
1,ENTP,1,1,0,1,"""[""""I'm finding the lack of me in these posts ...",I'm finding the lack of me in these posts very...
2,INTP,0,1,0,1,"""['Good one _____ https://www.youtube.com/w...","Of course, to which I say I know; that's my bl..."
3,INTJ,0,1,0,0,"""['Dear INTP, I enjoyed our conversation the...","Dear INTP, I enjoyed our conversation the ot..."
4,ENTJ,1,1,0,0,"""[""""You're fired."""", """"That's another silly mi...",You're fired. That's another silly misconcepti...
...,...,...,...,...,...,...,...
8670,ISFP,0,0,1,1,"""['https://www.youtube.com/watch?v=t8edHB_h908...",Especially on websites that have become a have...
8671,ENFP,1,1,1,1,"""['So...if this thread already exists someplac...",Ooops...I guess I didn't look too hard because...
8672,INTP,0,1,0,1,"""['So many questions when i do these things. ...",So many questions when i do these things. I w...
8673,INFP,0,1,1,1,"""['I am very conflicted right now when it come...",I am very conflicted right now when it comes t...


In [4]:
null_data = df[df.isnull().any(axis=1)]

In [10]:
null_data

Unnamed: 0,type,I-E,N-S,T-F,J-P,post,post_no_links
3508,INFJ,0,1,1,0,"""['ENTP https://www.youtube.com/watch?v=oJwW...",


In [9]:
null_data.post[3508]

'"[\'ENTP   https://www.youtube.com/watch?v=oJwWmz8Mp3U\', \'IxFJ   https://www.youtube.com/watch?v=-8wjhgcAgsM&amp;feature=youtu.be\', \'The lyrics are totally ISFP.   https://www.youtube.com/watch?v=1xXykStD-YA\', \'ISFP   https://www.youtube.com/watch?v=-2d38N4LSW8\', \'INTJ   https://www.youtube.com/watch?v=QSVyyykaEOo&amp;feature=youtu.be\', \'INFJ, definitely.   https://www.youtube.com/watch?v=YxS4lqppZ6Y\', \'INFP   https://www.youtube.com/watch?v=IxfX_2dKrbE\', \'ESFP   https://www.youtube.com/watch?v=2PoLaX4IA_0\', \'ESTJ   https://www.youtube.com/watch?v=p8kjbjx2EUw&amp;feature=youtu.be\', \'ISFP 6w7   https://www.youtube.com/watch?v=G4w3zmpuhPI\', \'INFJ   https://www.youtube.com/watch?v=38by00DGid0\', \'ISFP?   https://www.youtube.com/watch?v=u3ivr41yxGs\', \'ISTP   https://www.youtube.com/watch?v=0HwC9gE7LbM\', \'IxFP   https://www.youtube.com/watch?time_continue=20&amp;v=crZI9fALH4k\', \'INTJ   https://www.youtube.com/watch?v=Ay4v7mhEh54&amp;feature=youtu.be\', \'ISFJ   h

Remove nulls and blank posts

In [4]:
df = df.dropna()

# SpaCy and Regex preprocessing

In [6]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Load English tokenizer, tagger, parser, NER and word vectors
parser = spacy.load('en_core_web_sm')

# Create our list of stopwords
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)
    
    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    
    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    
    # return preprocessed list of tokens
    return ' '.join(mytokens)

In [7]:
alphanumeric = lambda x: re.sub('\w*\d\w*', '', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x.lower())

In [8]:
df['spacy_post'] = df.post_no_links.apply(spacy_tokenizer) #.map(alphanumeric).map(punc_lower)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_post'] = df.post_no_links.apply(spacy_tokenizer) #.map(alphanumeric).map(punc_lower)


Unnamed: 0,type,I-E,N-S,T-F,J-P,post,post_no_links,clean_post
0,INFJ,0,1,1,0,"""['http://www.youtube.com/watch?v=qsXHcwe3krw'...",What has been the most life-changing experienc...,life change experience life perc experience im...
1,ENTP,1,1,0,1,"""[""""I'm finding the lack of me in these posts ...",I'm finding the lack of me in these posts very...,find lack post alarming sex boring position ex...
2,INTP,0,1,0,1,"""['Good one _____ https://www.youtube.com/w...","Of course, to which I say I know; that's my bl...",course know blessing curse absolutely positive...
3,INTJ,0,1,0,0,"""['Dear INTP, I enjoyed our conversation the...","Dear INTP, I enjoyed our conversation the ot...",dear intp enjoy conversation day esoteric gabb...
4,ENTJ,1,1,0,0,"""[""""You're fired."""", """"That's another silly mi...",You're fired. That's another silly misconcepti...,fire silly misconception approach logically ke...
...,...,...,...,...,...,...,...,...
8670,ISFP,0,0,1,1,"""['https://www.youtube.com/watch?v=t8edHB_h908...",Especially on websites that have become a have...,especially website haven neo nazis perc. nerd ...
8671,ENFP,1,1,1,1,"""['So...if this thread already exists someplac...",Ooops...I guess I didn't look too hard because...,ooop ... guess look hard start movies enfp wat...
8672,INTP,0,1,0,1,"""['So many questions when i do these things. ...",So many questions when i do these things. I w...,question thing purple pill pick win lottery nu...
8673,INFP,0,1,1,1,"""['I am very conflicted right now when it come...",I am very conflicted right now when it comes t...,conflicted right come want child honestly mate...


In [17]:
df['clean_post'] = df.spacy_post.map(alphanumeric).map(punc_lower)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_post'] = df.spacy_post.map(alphanumeric).map(punc_lower)


In [18]:
df

Unnamed: 0,type,I-E,N-S,T-F,J-P,post,post_no_links,clean_post,spacy_post
0,INFJ,0,1,1,0,"""['http://www.youtube.com/watch?v=qsXHcwe3krw'...",What has been the most life-changing experienc...,life change experience life perc experience im...,life change experience life perc experience im...
1,ENTP,1,1,0,1,"""[""""I'm finding the lack of me in these posts ...",I'm finding the lack of me in these posts very...,find lack post alarming sex boring position ex...,find lack post alarming sex boring position ex...
2,INTP,0,1,0,1,"""['Good one _____ https://www.youtube.com/w...","Of course, to which I say I know; that's my bl...",course know blessing curse absolutely positive...,course know blessing curse absolutely positive...
3,INTJ,0,1,0,0,"""['Dear INTP, I enjoyed our conversation the...","Dear INTP, I enjoyed our conversation the ot...",dear intp enjoy conversation day esoteric gabb...,dear intp enjoy conversation day esoteric gabb...
4,ENTJ,1,1,0,0,"""[""""You're fired."""", """"That's another silly mi...",You're fired. That's another silly misconcepti...,fire silly misconception approach logically ke...,fire silly misconception approach logically ke...
...,...,...,...,...,...,...,...,...,...
8670,ISFP,0,0,1,1,"""['https://www.youtube.com/watch?v=t8edHB_h908...",Especially on websites that have become a have...,especially website haven neo nazis perc nerd l...,especially website haven neo nazis perc. nerd ...
8671,ENFP,1,1,1,1,"""['So...if this thread already exists someplac...",Ooops...I guess I didn't look too hard because...,ooop guess look hard start movies enfp watch ...,ooop ... guess look hard start movies enfp wat...
8672,INTP,0,1,0,1,"""['So many questions when i do these things. ...",So many questions when i do these things. I w...,question thing purple pill pick win lottery nu...,question thing purple pill pick win lottery nu...
8673,INFP,0,1,1,1,"""['I am very conflicted right now when it come...",I am very conflicted right now when it comes t...,conflicted right come want child honestly mate...,conflicted right come want child honestly mate...


# Spacy-made-easy template

In [147]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import string
punctuations = string.punctuation

from spacy.lang.en import English
parser = English()

#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic utility function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [148]:
#Create spacy tokenizer that parses a sentence and generates tokens
#these can also be replaced by word vectors 
def spacy_tokenizer(sentence):
    tokens = parser(sentence)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]     
    return tokens
    

#create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
classifier = LinearSVC()

In [149]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

# Load sample data
train = list(zip(X_train,y_train))
test =  list(zip(X_test,y_test))

# Create model and measure accuracy
pipe.fit([x[0] for x in train], [x[1] for x in train]) 
pred_data = pipe.predict([x[0] for x in test]) 
for (sample, pred) in zip(test, pred_data):
    print(sample, pred)
print("Accuracy:", accuracy_score([x[1] for x in test], pred_data))
print("Accuracy:", accuracy_score([x[1] for x in test], pred_data))
print("Accuracy:", accuracy_score([x[1] for x in test], pred_data))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Train-Test Split

In [23]:
# Split the data into X and y data sets
X = df.clean_post
y = df['I-E']

In [55]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=34)

# CountVectorizer

In [61]:
# The first document-term matrix has default Count Vectorizer values - counts of unigrams
cv1 = CountVectorizer(max_features=10, ngram_range=(1,2), binary=True, stop_words='english')

X_train_cv1 = cv1.fit_transform(X_train)
X_test_cv1  = cv1.transform(X_test)

pd.DataFrame(X_train_cv1.toarray(), columns=cv1.get_feature_names())

Unnamed: 0,feel,good,know,like,people,thing,think,time,want,way
0,1,1,1,1,1,1,1,1,1,1
1,0,1,1,1,1,1,1,0,1,1
2,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
6500,1,1,1,1,0,1,1,1,1,0
6501,1,1,1,1,1,1,1,1,1,1
6502,1,0,1,1,1,1,1,1,1,1
6503,1,1,1,1,1,1,1,1,1,0


### Try using TF-IDF instead of Count Vectorizer

In [62]:
# Create TF-IDF versions of the Count Vectorizers created earlier in the exercise
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf1 = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf1 = tfidf1.fit_transform(X_train)
X_test_tfidf1  = tfidf1.transform(X_test)

In [54]:
len(pd.DataFrame(X_train_tfidf1.toarray(), columns=tfidf1.get_feature_names()).columns)

74232

# Topic Modeling

### Latent Semantic Analysis (LSA) and nonnegative matrix factorization(NMF)

In [None]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
lsa = NMF(400)
doc_topic = lsa.fit_transform(pd.DataFrame(X_train_tfidf1.toarray(), columns=tfidf1.get_feature_names()))

In [64]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """
    Takes in model and feature names and outputs 
    a list of string of the top words from each topic.
    """
    topics = []
    for ix, topic in enumerate(model.components_):
        topics.append(str(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]])))
    return topics

In [65]:
topics = display_topics(lsa, tfidf1.get_feature_names(), 10)

In [66]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index =  topics,
             columns = tfidf1.get_feature_names())
topic_word

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaaaaaa,aaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaagggghhh,...,아저씨,안녕하세요,유재석,ｅｘａｃｔｌｙ,ｎｅｓｓ,ｖａｐｏｒｗａｖｅ,ﾉﾞ,ﾉﾟ,ﾟﾉ,ﾟﾟ
"like, think, people, know, feel, thing, time, good, type, want",0.001,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
"type, function, fe, fi, ni, ne, ti, entp, intj, se",-0.000,-0.000,-0.000,-0.000,0.000,-0.000,-0.000,-0.000,-0.000,-0.000,...,-0.000,0.0,-0.000,0.000,0.000,0.000,-0.000,0.000,0.000,-0.000
"enfp, infp, infj, enfj, love, friend, feel, lol, like, isfp",-0.002,0.000,0.001,0.000,0.000,0.000,-0.000,0.000,0.000,0.000,...,-0.000,0.0,-0.000,0.000,0.000,0.000,0.000,-0.000,-0.000,0.001
"tapatalk, send, iphone, use, gt, sm, tmlt, welcome, nexus, ipad",-0.000,0.000,-0.000,-0.000,-0.000,0.000,-0.000,0.000,0.000,0.000,...,0.000,0.0,-0.000,-0.000,0.000,0.000,-0.000,0.000,0.000,0.000
"welcome, entp, intp, intj, forum, enfp, hello, perc, istj, xd",0.003,0.000,-0.000,-0.000,0.000,0.000,0.000,0.000,0.000,-0.000,...,0.000,0.0,0.000,0.000,0.000,0.000,0.001,0.000,0.000,0.001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"express, follow, edit, hell, stop, morning, shock, language, research, simple",-0.007,-0.000,0.003,0.001,-0.001,0.000,-0.003,0.001,-0.001,-0.000,...,-0.001,-0.0,-0.001,-0.000,0.001,0.001,0.000,-0.001,-0.001,0.002
"moment, sense, half, vote, able, identify, gt, annoying, instance, user",0.002,-0.001,-0.000,0.001,-0.001,-0.001,0.001,-0.000,0.001,-0.001,...,-0.001,-0.0,0.001,0.001,0.000,0.000,-0.000,-0.000,-0.000,-0.000
"inside, strongly, sort, real, end, sensor, wake, rock, subject, engineering",0.000,-0.003,-0.001,0.000,0.000,-0.001,0.001,0.000,-0.002,-0.001,...,0.002,0.0,-0.001,0.001,0.001,0.001,0.000,-0.000,-0.000,0.002
"finish, gt, issue, christmas, pain, friendship, thought, leave, active, facebook",0.002,0.001,-0.002,0.000,-0.001,0.000,0.002,0.001,0.001,0.000,...,-0.001,-0.0,-0.000,0.000,0.000,0.000,0.001,-0.000,-0.000,0.003


The Vt matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics. In this case:
- The first four documents seem to be about thinking
- The last three documents seem to be about feeling

In [67]:
X_test_topic_array = lsa.transform(pd.DataFrame(X_test_tfidf1.toarray(), columns=tfidf1.get_feature_names()))

## Organize Topic DataFrames

In [68]:
X_train_topics = pd.DataFrame(doc_topic.round(5),
             index = X_train.index,
             columns = topics)
X_test_topics = pd.DataFrame(X_test_topic_array.round(5),
             index = X_test.index,
             columns = topics)
X_train_topics

Unnamed: 0,"like, think, people, know, feel, thing, time, good, type, want","type, function, fe, fi, ni, ne, ti, entp, intj, se","enfp, infp, infj, enfj, love, friend, feel, lol, like, isfp","tapatalk, send, iphone, use, gt, sm, tmlt, welcome, nexus, ipad","welcome, entp, intp, intj, forum, enfp, hello, perc, istj, xd","welcome, forum, hello, infj, thank, perc, function, hi, infp, hope","infj, intj, relationship, entp, date, thank, welcome, understand, infjs, enfj","lol, think, like, haha, xd, yeah, thank, entp, mean, enfp","infp, intj, intp, think, infps, friend, istj, tapatalk, people, esfj","enfp, sx, sp, type, enneagram, istj, entj, relationship, work, thank",...,"iam, quick, awesome, run, fellow, express, letter, shy, deeply, nope","check, power, area, logical, different, score, future, neutral, mention, role","potential, bother, effort, gift, pain, focus, drive, cook, list, actual","important, opinion, score, hahaha, difficult, home, form, far, develop, bored","user, blush, intuition, dress, cake, drive, depression, identify, hard, history","express, follow, edit, hell, stop, morning, shock, language, research, simple","moment, sense, half, vote, able, identify, gt, annoying, instance, user","inside, strongly, sort, real, end, sensor, wake, rock, subject, engineering","finish, gt, issue, christmas, pain, friendship, thought, leave, active, facebook","save, level, particularly, super, behaviour, dance, stereotype, worry, coffee, matter"
1632,0.33509,-0.02789,0.01374,-0.01772,-0.05589,0.04988,0.01129,-0.00075,0.02995,0.03991,...,0.00424,-0.03481,0.00772,-0.03880,0.01299,-0.00316,0.00205,-0.00907,-0.00329,0.02663
1454,0.23359,0.04501,-0.12579,0.01388,0.03814,-0.02376,0.00932,-0.01859,-0.04843,-0.00493,...,0.02604,0.03013,-0.00526,-0.00978,-0.00882,0.01288,-0.01089,0.04412,-0.01235,0.00211
5949,0.32905,-0.08590,0.00430,-0.01083,-0.00184,0.00258,-0.03298,-0.06280,0.01177,0.01727,...,0.00809,-0.00024,0.00308,0.02908,-0.04364,0.03678,0.02714,0.01154,0.00812,-0.01831
1077,0.38754,0.12391,-0.05474,-0.00946,0.04634,-0.08752,0.02376,0.11678,-0.07013,0.03260,...,-0.01081,-0.01785,0.00202,0.00434,-0.00571,0.01094,-0.02400,-0.00269,-0.03544,0.00425
5096,0.40410,-0.04025,0.10875,-0.02923,-0.03069,-0.03457,0.00227,0.05713,0.03230,0.07200,...,0.00416,0.02710,-0.00628,-0.04538,-0.01627,-0.02325,-0.00452,-0.00460,0.01042,-0.00135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5668,0.17376,-0.06727,0.05779,0.00205,0.04291,0.08771,-0.04356,0.02525,-0.01876,-0.00000,...,-0.01503,0.01384,0.00421,0.00166,-0.00266,-0.00487,-0.00709,0.00183,-0.00195,-0.01157
324,0.43893,0.05238,0.09259,-0.01405,0.04851,-0.01695,0.25469,-0.08602,0.00502,-0.02607,...,-0.00256,-0.02829,-0.01164,-0.00916,-0.01105,0.01448,0.00756,-0.02719,-0.01687,0.02756
3157,0.35827,0.00974,0.05847,-0.01926,-0.03613,-0.02390,-0.02308,0.00431,-0.01926,0.00455,...,-0.00550,-0.00975,-0.00971,-0.00056,0.01013,0.00950,0.02532,0.03280,-0.01692,0.00030
5994,0.36705,0.06364,-0.00285,0.02620,0.07068,0.08518,0.08771,0.04806,-0.07806,0.03730,...,0.02460,0.01209,-0.06150,0.00720,-0.01041,-0.02538,0.01846,0.03069,-0.01412,-0.00365


In [72]:
X_test_topics

Unnamed: 0,"angry, anger, upset, sad, hate, annoyed, calm, mad, frustrated, fear","ne, ni, fi, se, te, ti, si, fe, inferior, function","esfp, esfj, isfp, estp, estj, isfj, istj, entj, enfj, istp","science, math, physics, study, psychology, studying, university, major, engineering, student","hey, ya, kinda, lol, dude, ok, cool, wanna, guys, gotta","welcome, hello, welcome forum, fellow, forum, perc, cafe, hope, personality cafe, enjoy","heart, beautiful, eyes, soul, inside, deep, light, pain, rain, sun","dating, relationships, date, relationship, boyfriend, dated, ex, romantic, wants, married","anxiety, depression, social anxiety, disorder, social, depressed, anxious, diagnosed, bipolar, therapy","job, working, jobs, business, worked, office, career, money, city, field",...,"makes sense, sense, make sense, makes, doesnt make, does make, make, wouldnt, difficult, doesnt","infjs, infj, im infj, enfj, fe, enfjs, ni, similar, able, experience","worry, dont worry, fine, like think, ill just, rest, lately, awesome, matter, suddenly","spend, spend time, lot time, spent, plan, subject, prefer, enjoy, effort, talking","leave, wait, room, home, stay, minutes, away, knows, sit, leaving","happened, knew, told, wanted, asked, came, saw, gave, looked, wasnt","waste, waste time, stupid, purpose, worse, eventually, rarely, word, anymore, pointless","free, feel free, wikipedia, everyday, freedom, add, feel, wish, instead, pc","follow, rules, rule, following, code, damn, break, impossible, istjs, needed","afraid, im afraid, fear, forget, reality, later, hate, help, willing, change"
7048,0.00003,0.00000,0.00027,0.00000,0.00000,0.00057,0.00026,0.00000,0.00138,0.00000,...,0.02158,0.00000,0.00000,0.01832,0.00146,0.02445,0.00000,0.00000,0.01286,0.02020
4211,0.00096,0.00088,0.00176,0.00000,0.00000,0.00058,0.00033,0.00183,0.00007,0.00064,...,0.00000,0.00227,0.00133,0.01438,0.01208,0.00585,0.00257,0.00035,0.00104,0.03525
6663,0.00000,0.00070,0.00000,0.00003,0.00000,0.00027,0.00130,0.00054,0.00022,0.00067,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.01480,0.00000,0.00019,0.04021,0.00000
3835,0.00145,0.00000,0.00139,0.00008,0.00081,0.00058,0.00021,0.00181,0.00001,0.00069,...,0.00000,0.01595,0.00093,0.00000,0.00000,0.00120,0.00174,0.02083,0.00366,0.00051
8608,0.00000,0.00380,0.00473,0.00000,0.00250,0.00000,0.00019,0.00101,0.00000,0.00080,...,0.00000,0.00000,0.03398,0.01552,0.02105,0.02077,0.00000,0.02064,0.00000,0.00145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4609,0.00012,0.00042,0.00037,0.00012,0.00292,0.00038,0.00034,0.00088,0.00110,0.00105,...,0.00663,0.01490,0.00000,0.03036,0.00229,0.01067,0.00000,0.00000,0.02279,0.00000
3217,0.00000,0.00000,0.00354,0.00000,0.00326,0.00003,0.00000,0.00190,0.00083,0.00000,...,0.02730,0.00949,0.00369,0.00000,0.00103,0.01452,0.00000,0.00000,0.00000,0.00000
705,0.00021,0.00000,0.00000,0.00000,0.00307,0.00000,0.00008,0.00185,0.00000,0.00156,...,0.00004,0.01694,0.00000,0.00000,0.02333,0.00474,0.00000,0.00000,0.01882,0.00276
1872,0.00154,0.00000,0.00034,0.00004,0.00333,0.00001,0.00025,0.00298,0.00000,0.00197,...,0.00000,0.00000,0.03740,0.00000,0.00827,0.00000,0.00049,0.00000,0.00000,0.00000


# Export as csv's

In [73]:
X_train_topics.to_csv('X_train_topics.csv')
X_test_topics.to_csv('X_test_topics.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')