# Topic Modeling Discrimination  
### Table of Contents  
[Import](#Import)  
[Functions](#Functions)  
[Data Processing](#Data-Processing)  
[Visualize pyLDAvis](#Visualize-pyLDAvis)

## Import  
[Table of Contents](#Table-of-Contents)

In [49]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

import pyLDAvis.sklearn
import pyLDAvis

from nltk.corpus import stopwords
import copy

In [5]:
# Load the survey export and drop its 'Unnamed: 0' column.
df = pd.read_csv('../Data_Prep/AVENCensus2016_data.csv')
# `axis` is passed by keyword: the positional form `drop(labels, 1)` is
# rejected by pandas >= 2.0.
df = df.drop(['Unnamed: 0'], axis=1)
df.head(2)

Unnamed: 0,timestamp,year,month,education,student,religion,religionOther,raceDesc,nationality,USRegion,...,notPartNaN,notPartNervous,notPartNoLocal,notPartNotInt,notPartRacism,notPartSocAnxiety,notPartNotAccDisab,notPartUnfriendToLGBTQ,notPartUnfriendToGreyDemi,notPartElitistNonAces
0,2016-10-23 18:57:01,1995.0,July,Some college (no degree yet),"Yes, in an undergraduate program",Agnostic,,White,United States of America,Florida,...,0,0,1,1,0,0,0,0,1,0
1,2016-10-23 19:35:35,1998.0,September,Less than high school,"Yes, in high school",Roman Catholic,,White American,United Kingdom,,...,0,1,0,0,0,0,0,0,0,0


## Data Processing  
[Table of Contents](#Table-of-Contents)

In [54]:
# Base stop list reused (via copies) by every section below:
# NLTK's English stop words followed by a handful of punctuation marks.
my_stops = stopwords.words('english') + ['?', '!', '.', ',', ':', ';']

In [25]:
def processing(data, col_num):
    """Return the non-missing entries of one column as lower-cased strings.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to extract. It is not modified.
    col_num : str or int
        Column label. An integer that is not itself a column label is
        treated as a 0-based column position, mirroring the
        label-then-position fallback of the removed ``.ix`` indexer.

    Returns
    -------
    list of str
        One lower-cased string per row whose value in the column is not
        missing, in the original row order.
    """
    # `.ix` no longer exists in pandas >= 1.0; resolve a positional index to
    # its label explicitly and then use plain label-based selection.
    if isinstance(col_num, int) and col_num not in data.columns:
        col_num = data.columns[col_num]
    values = data[col_num].dropna()
    # str() guards against stray non-string cells (e.g. numbers) in a
    # free-text column, which would otherwise raise on `.lower()`.
    return [str(value).lower() for value in values]

## Visualize pyLDAvis  
[Table of Contents](#Table-of-Contents)

### Discrimination

In [None]:
# Non-missing, lower-cased answers from the 'discrimLong' column.
discrim = processing(df, 'discrimLong')

# Work on a private copy so the shared `my_stops` list is never mutated.
discrim_stops = copy.deepcopy(my_stops)
discrim_stops.append(';')

# Count 2- and 3-grams of alphabetic tokens that are at least 3 letters long.
# max_df / min_df drop n-grams found in more than half of the documents or
# in fewer than two of them.
tf_vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words=discrim_stops,
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.5,
    min_df=2,
    ngram_range=(2, 3),
)

In [68]:
# Dense document-term count matrix for the discrimination answers.
mat = tf_vectorizer.fit_transform(discrim).toarray()

# `vocabulary_` maps n-gram -> column index. Ordering the n-grams by that
# index yields a list where words[i] is the name of column i of `mat`.
words = sorted(tf_vectorizer.vocabulary_, key=tf_vectorizer.vocabulary_.get)
print(words)


['able avoid', 'able find', 'able handle', 'able help', 'able say', 'abuse harassment', 'abuse sexual', 'abuse verbal', 'abused asexual', 'abused child', 'accept asexuality', 'accused lesbian', 'ace anymore', 'ace aro', 'ace aro people', 'ace communities', 'ace community', 'ace could', 'ace demi', 'ace discourse', 'ace discourse asexual', 'ace discourse tumblr', 'ace discoursers', 'ace enough', 'ace erasure', 'ace exclusionary', 'ace experienced', 'ace friendly', 'ace friends', 'ace gay', 'ace harassed', 'ace hate', 'ace homoromantic', 'ace identities', 'ace identity', 'ace inclusion', 'ace inclusive', 'ace lesbian', 'ace like', 'ace made', 'ace never', 'ace non', 'ace non ace', 'ace one', 'ace part', 'ace partners', 'ace people', 'ace people belong', 'ace people exist', 'ace people get', 'ace people people', 'ace people talk', 'ace people want', 'ace person', 'ace really', 'ace recently', 'ace said', 'ace several', 'ace sex', 'ace spaces', 'ace spec', 'ace spectrum', 'ace still', 'ace

In [83]:
# One row per document, one column per n-gram.
test = pd.DataFrame(mat)
test.columns = words

# Total occurrences of each n-gram across all documents. The sum must run
# down the rows (axis=0) to give one value per column; axis=1 would give
# per-document totals, which do not line up with the n-gram names.
word_counts = pd.DataFrame(list(zip(test.columns, test.sum(axis=0))))

# Keep only n-grams seen more than 20 times. `.copy()` makes the result an
# independent frame so renaming its columns does not touch `word_counts`.
best_words = word_counts[word_counts[1] > 20].copy()
best_words.columns = ['words', 'counts']
best_words.to_csv('best_words.csv', index=False)

# Last expression of the cell so the table is actually displayed.
best_words

In [55]:
# Sparse document-term matrix for the discrimination answers.
dtm_tf = tf_vectorizer.fit_transform(discrim)

# Five-topic LDA with a fixed seed, using every available core.
# `n_components` replaces the `n_topics` keyword, which scikit-learn >= 0.21
# no longer accepts.
lda_tf = LatentDirichletAllocation(n_components=5, random_state=0, n_jobs=-1,
                                   learning_method='batch', learning_offset=50.)

lda_tf.fit(dtm_tf)

# Build and render the interactive pyLDAvis view of the fitted model.
data = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(data)

In [56]:
# Topic model for the 'outResponceLong' column.
outLong = processing(df, 'outResponceLong')

# Private copy of the shared stop list for this section.
out_stops = copy.deepcopy(my_stops)
out_stops+=[';']

# 2- and 3-grams of alphabetic tokens (>= 3 letters); n-grams found in more
# than half of the documents or in fewer than two are discarded.
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words=out_stops,
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.5,
                                min_df=2,
                                ngram_range=(2, 3))
dtm_tf = tf_vectorizer.fit_transform(outLong)

# `n_components` replaces the `n_topics` keyword, which scikit-learn >= 0.21
# no longer accepts.
lda_tf = LatentDirichletAllocation(n_components=5, random_state=0, n_jobs=-1,
                                   learning_method='batch')
lda_tf.fit(dtm_tf)

# Build and render the interactive pyLDAvis view of the fitted model.
data = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(data)

In [57]:
# Topic model for the 'stories' column.
stories = processing(df, 'stories')

# Private copy of the shared stop list for this section.
stories_stops = copy.deepcopy(my_stops)
stories_stops+=[';']

# 2- and 3-grams of alphabetic tokens (>= 3 letters); n-grams found in more
# than half of the documents or in fewer than two are discarded.
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words=stories_stops,
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.5,
                                min_df=2,
                                ngram_range=(2, 3))
dtm_tf = tf_vectorizer.fit_transform(stories)

# `n_components` replaces the `n_topics` keyword, which scikit-learn >= 0.21
# no longer accepts.
lda_tf = LatentDirichletAllocation(n_components=5, random_state=0, n_jobs=-1,
                                   learning_method='batch')
lda_tf.fit(dtm_tf)

# Build and render the interactive pyLDAvis view of the fitted model.
data = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(data)

In [62]:
# Topic model for the 'questions' column.
questions = processing(df, 'questions')

# Private copy of the shared stop list plus filler words for this section.
# NOTE(review): CountVectorizer compares stop words against single tokens
# before n-grams are assembled, so the multi-word entries below ('like know',
# 'would love', 'really want', ...) never match anything. Only 'would',
# 'like' and 'know' take effect. Filtering whole phrases would need a
# different mechanism -- confirm the intended behaviour.
quest_stops = copy.deepcopy(my_stops)
quest_stops+=['would',  'like','know', 'like know', 'would love', 'really want', 'would like know', 'want know many', ]

# 2- and 3-grams of alphabetic tokens (>= 3 letters); n-grams found in more
# than half of the documents or in fewer than two are discarded.
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words=quest_stops,
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.5,
                                min_df=2,
                                ngram_range=(2, 3))
dtm_tf = tf_vectorizer.fit_transform(questions)

# `n_components` replaces the `n_topics` keyword, which scikit-learn >= 0.21
# no longer accepts.
lda_tf = LatentDirichletAllocation(n_components=5, random_state=0, n_jobs=-1,
                                   learning_method='batch')
lda_tf.fit(dtm_tf)

# Build and render the interactive pyLDAvis view of the fitted model.
data = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(data)

In [85]:
# Topic model for the 'aceSupportLong' column.
aceSupportLong = processing(df, 'aceSupportLong')

# Private copy of the shared stop list plus terms specific to this section.
# NOTE(review): stop words are matched against single tokens, so the
# two-word entry 'would like' never matches on its own; here it is covered
# anyway because 'would' and 'like' are listed individually.
aceSupp_stops = copy.deepcopy(my_stops)
aceSupp_stops+=['would like', 'would', 'like', 'ace', 'asexual']

# 2- and 3-grams of alphabetic tokens (>= 3 letters); n-grams found in more
# than half of the documents or in fewer than two are discarded.
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words=aceSupp_stops,
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.5,
                                min_df=2,
                                ngram_range=(2, 3))
dtm_tf = tf_vectorizer.fit_transform(aceSupportLong)

# `n_components` replaces the `n_topics` keyword, which scikit-learn >= 0.21
# no longer accepts.
lda_tf = LatentDirichletAllocation(n_components=5, random_state=0, n_jobs=-1,
                                   learning_method='batch')
lda_tf.fit(dtm_tf)

# Build and render the interactive pyLDAvis view of the fitted model.
data = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(data)

In [67]:
# Topic model for the 'allySupportLong' column.
allySupportLong = processing(df, 'allySupportLong')

# Private copy of the shared stop list plus terms specific to this section.
# NOTE(review): stop words are matched against single tokens before n-grams
# are assembled, so the three-word entry 'non ace people' never matches and
# that trigram can still appear in the vocabulary. Only 'allies' takes
# effect -- confirm the intended behaviour.
allySupp_stops = copy.deepcopy(my_stops)
allySupp_stops+=['non ace people', 'allies']

# 2- and 3-grams of alphabetic tokens (>= 3 letters); n-grams found in more
# than half of the documents or in fewer than two are discarded.
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words=allySupp_stops,
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.5,
                                min_df=2,
                                ngram_range=(2, 3))
dtm_tf = tf_vectorizer.fit_transform(allySupportLong)

# `n_components` replaces the `n_topics` keyword, which scikit-learn >= 0.21
# no longer accepts.
lda_tf = LatentDirichletAllocation(n_components=5, random_state=0, n_jobs=-1,
                                   learning_method='batch')
lda_tf.fit(dtm_tf)

# Build and render the interactive pyLDAvis view of the fitted model.
data = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(data)