In [38]:
import json
import csv
import pandas as pd
import numpy as np
import nltk

import string
import re
import gensim

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)



In [None]:
#Brining in arxiv metadata as CSV
#df = pd.read_csv("/Users/aster/Desktop/Fall 2020/ML/final_project/data/arxiv_metadata.csv")

In [None]:
#df.head()

In [39]:
#Setting up Paths for rest of Notebook
import os
import os.path

#'uncleaned' subset of from arxiv metadata snapshot. ML(cs.lg) and or AI(cs.ai) category tagged papers with "fair" in the title
#/Users/aster/Desktop/fall_2020/ML/final_project/pratt_ml_final_project/data/arxiv_corpus_2020_11_23.csv
snapshot = os.path.join("/Users/aster/Desktop/fall_2020/ML/final_project/data/arxiv_corpus_2020_11_23.csv")

#data after cleaning, lemmatization, bigrams, tokenization
dataset = os.path.join("/Users/aster/Desktop/fall_2020/ML/final_project/data/arxiv_corpus_2020_12_8.csv")

### Data Subset Selection

Now that we have the arxiv metadata snapshot as a csv we can make 2 corpus, one with abstract data and one with title; which we'll preprocess and fit our model. For this project we're particularly interested in how machine learning practitioners are thinking and talking about "fairness". We're trying to see if we can identify types of approaches to fairness in the form of subjects. 

Our data subset will include papers: 
- With "fair" in the title
- With machine learning (cs.LG) and artificial intelligence (cs.AI) in the categories column

In this part of the project required loading the arxiv metadata snaphot which was rather large. I've commented it out here. 

In [None]:
#Papers that have fair in title
#df=df[df['title'].str.lower().str.contains("fair")]


In [None]:
#Papers including fair in the title that are in our categories of interest
df = df[df["categories"].str.lower().str.contains("cs.ai", "cs.lg")]
df.shape

In [None]:
#Writing subset to a csv file in the data folder
df = pd.read_csv(snapshot, encoding = 'utf8')
#df.head()

In [None]:
#using a lambda function to apply str(ing).lower(case) to the abstract and title column. 
#Also making new columns for the "cleaned" abstract and title data. This way we hold unto our original data.
#I'm using a lambda function because I need to apply this to multiple columns
df[["abstract_clean", "title_clean"]] = df[["abstract", "title"]].apply(lambda x: x.str.lower())

## Data Cleaning: 

Now that we have our subset data in one place we can move unto cleaning. 

- [x] I'd like to get rid of puntuations.  
- [x] I'd like to get rid of "/n"
- [x] I'd like to get rid of numbers


In [None]:
import re
import string

#creating a variable to hold all the string patterns we want to operate on with regex
# "|" pipe is an "or" joing the pattern for:
#punctuation "[^\w\s]"
#new line marker "\n"
#and numerals "\d+"
#then apply .str.replace via lambda function
pattern = '|'.join(["[^\w\s]", "\n", "\d+", "[‘’“”…]"])

df[['title_clean','abstract_clean']] = df[['title_clean','abstract_clean']].apply(lambda x: x.str.replace(pattern, ' '))
#df.head()


In [None]:
#Leading and Trailing White Space
df[['title_clean','abstract_clean']] = df[['title_clean','abstract_clean']].apply(lambda x: x.str.strip())
#df.head()

## Further Cleaning - More Normalization and Tokenization

Now that we have our corpus clean we can had into preprocessing for our model. 
Our Preprocessing Tasks are: 

- [x] Stop words
- [x] Stemming/Lemmatization of words
- [x] Tokenizing words - Creating Document Term Matrix (otherwise known as bag of words)
- [] Bigrams

In [None]:
### Stemming/Lemmatization 
#nltk.download('wordnet')
import nltk
#nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phraser


t = nltk.tokenize.WhitespaceTokenizer()
lem = nltk.stem.WordNetLemmatizer()

#this function will tokenize AND THEN lemmatize values in a column
def lem_text(text):
    return [lem.lemmatize(w, 'v') for w in t.tokenize(text)]

#df = pd.DataFrame([‘this was cheesy blessing’, 'she likes these books ', ‘wow this is great amazing’], columns=[‘text’])
#print(df)

df["title_clean"] = df["title_clean"].apply(lem_text)
df["abstract_clean"] = df["abstract_clean"].apply(lem_text)

#Now we have tokenized and lemmed columns of abstract adn title data, 

In [None]:
#df.head()

In [None]:
#Getting rid of Stop Words
from nltk.corpus import stopwords

stop=stopwords.words("english")

df["abstract_clean"]= df['abstract_clean'].apply(lambda x: [item for item in x if item not in stop])
df['title_clean'] = df['title_clean'].apply(lambda x: [item for item in x if item not in stop])
# Strangely I can't group these funtion together or it doesn't seem to work on both columns i'm tryin to select. 

In [None]:
#df.head()

### N-grams: Bigrams, Trigrams, Probability

In computational lingusitics/natural language processing an "n-gram" describes a sequence of n-items in a collection of text. 

So in the sentence:

"Machine Learning should reckon with fairness, justice, and equity."

"Machine" - 1-gram (unigram)

"Machine, Learning" - 2-gram (bigram)

"Machine, Learning, should" - 3-gram (trigram)

Machine, Learning, should, reckon" - 4-(quadgram)

etc. 

Currently the text we're looking at is tokenized as unigrams. So now we're going to try using bigrams and trigrams and see what sort of new results we can get. 


In [None]:
###Setting Bigrams
import gensim
from gensim.models import Phrases
from gensim.models.phrases import Phraser

def bigrams(text):
    bigram = Phrases(text, min_count=1)
    bigram_mod = Phraser(bigram)
    return [bigram_mod[[x]] for x in text]

df['title_clean']=df['title_clean'].apply(lambda x: (x))
df['title_clean']=df[['title_clean']].apply(bigrams)


In [None]:
#df.head()

In [None]:
#Setting bigrams for the abstract data. We'll primarily be looking at abstracts for our topics.
df['abstract_clean']=df['abstract_clean'].apply(lambda x: (x))
df['abstract_clean']=df[['abstract_clean']].apply(bigrams)


In [None]:
#Was just checking to see if my lambda function was applied. It was but it's returning a generator object
#I'll have to transform my text data back into a list. 
#df['abstract_clean'][1]

In [None]:
#list(df['abstract_clean'][1])

In [None]:
df["title_clean"] = df["title_clean"].apply(lambda x: list(x)[0])

In [None]:
df["abstract_clean"] = df["abstract_clean"].apply(lambda x: list(x)[0])

In [None]:
#Checking the 2nd paper's title to make sure it's a list of unigram/bigrams and not a generator object
#df["title_clean"][1]

In [40]:
#Rewriting df variable with our cleaned dataset. 
df= pd.read_csv(dataset)

In [None]:
#df.head()

### Exploratory Data Visualization

In [None]:
### Word Cloud Generation Just to see:

# Import the wordcloud library
from wordcloud import WordCloud

# Join the different processed titles together.
#all_title = ','.join(list(df["title_clean"].values))

# Create a Title WordCloud object
cloud=WordCloud(background_color="black", max_words=1000)
                       
# Generate a word cloud
cloud.generate(df["title_clean"].to_string())

# Visualize the word cloud
cloud.to_image()


In [None]:
# Create an Abstract WordCloud object
cloud=WordCloud(background_color="black", max_words=1000)
                       
# Generate a word cloud
cloud.generate(df["abstract_clean"].to_string())

# Visualize the word cloud
cloud.to_image()


There are a couple of intersting things we can begin to think about and question with this simple visualization. 

- We see words we expect to see like "fairness" and "fair"
    - interstingly "fair" and "fairness" weren't lemmed to "fair" but we might consider them representing different uses/understandings 
        - where and how are we using "fairness" as opposed to "fair"?
    - Also "unfairness" is interestingly small when compared to fair and fairness. SO those seem to be used at the same amounts (possibly even in similar functions/interchangeably?) but "unfairness"is beng used differently somewhat. 
    
- The largest used terms are apparently functional terms liek "model" "system" "classification" etc. 
- It's interesting that words like "social" and "representation" have low occurence. 


    

## Topic Modelling - Latent Dirlecht Allocation (LDA)

Topic Modeling: A statistical modeling of the abstract topics that occur within a document (i.e. topics are "latent" within the document). In topic modeling we can understand a document as not only a collection of text but a distribution of "topic". 

Latent Dirlecht Allocation (LDA) is one type of model used in Topic Modelling. Our LDA model will build "topics" identify as cluster of words/tokens and then look at the distribution of topics throughout our documents. 

Our tasks in this section will be:
- Preprocessing 
    - We'll preparing our data by creating a **Bag of Words** representation of our cleaned text data
        
            - the bag of words will require us to:
                1. Make dictionary with all of the words in our corpus, numbered (and perhaps in alphabetical order)
                2. A count of how often each word appears in each document
                
- Model Fitting:
    - We'll build a pipeline 
Here we'll build a pipeline for fitting the LDA model. 

Returning to dictionary buildig let's look at our abstract data. Our LDA will require 3 things:

1. a dictionary - this will hold all the possible words (and in this second trial bi-grams) in our text with an id. 

2. a corpus - this is our colecltion of text, in this case the abstract_clean column

3. a Count of our term-document frequency. So the count of how many times a term (word or bigram) appears in a document (row of data)


In [None]:
#Making a Document Term Matrix
from sklearn.feature_extraction.text import CountVectorizer

cv= CountVectorizer(stop_words="english")

title_cv = cv.fit_transform(df['title_clean'])

dtm = pd.DataFrame(title_cv.toarray(), columns=cv.get_feature_names())
dtm.index = df.index

#Here is our document-term matrix 
dtm.head()

In [None]:
#Apparently Gensim requires a term document matrix, where the index is our terms and the columns are documents.
tdm=dtm.transpose()
tdm.head()

In [None]:
df["title_clean"][1]

#### Working with Title Data:

In [45]:
from gensim.utils import simple_preprocess


def preprocess(text):
   result = []
   for token in gensim.utils.simple_preprocess(text):
      #if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
       if len(token) > 1: 
         result.append(token)
   return result

processed = df["title_clean"].map(preprocess)


dictionary = gensim.corpora.Dictionary(processed)


In [46]:
from gensim import matutils, models
import scipy.sparse

#transposing our document term matrix to a form gensim accepts
#so wher before x axis was a document and y axis was a word, these have been switched
tdm=dtm.transpose()

sparse_counts = scipy.sparse.csr_matrix(dtm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [47]:
#texts = [[word for word in df["title_clean"].split()]] 
#for document in documents]
#dictionary = corpora.Dictionary(texts)
#corpus = [dictionary.doc2bow(text) for text in texts]



#title_dict = gensim.corpora.Dictionary(df["title_clean"])

#creating a bag of words of just title information
#title_corpus = [title_dict.doc2bow(x) for x in df["title_clean"]]
#title_corpus = [title_dict.doc2bow(x) for x in texts]

#Instantiating LDA model
lda_t = gensim.models.ldamodel.LdaModel(id2word=dictionary, corpus=corpus, num_topics=5, passes=10)

#Putting our Title topics in a dataframe
title_topics = pd.DataFrame(lda_t.print_topics(num_words=10))
title_topics.columns= ["topics", "terms"]
title_topics

Unnamed: 0,topics,terms
0,0,"0.023*""explainable"" + 0.023*""reframing"" + 0.022*""un"" + 0.021*""kidney"" + 0.020*""diversity"" + 0.020*""groupwise"" + 0.019*""service"" + 0.018*""model"" + 0.016*""aggregation"" + 0.015*""optimality"""
1,1,"0.030*""items"" + 0.028*""intersectional"" + 0.023*""measurement"" + 0.022*""ordinal"" + 0.020*""food"" + 0.020*""mechanism"" + 0.020*""application"" + 0.018*""aware_machine"" + 0.018*""problem"" + 0.017*""outcome"""
2,2,"0.026*""toolkit"" + 0.026*""cut"" + 0.026*""query"" + 0.023*""veil"" + 0.021*""differences"" + 0.020*""potentially"" + 0.019*""term"" + 0.019*""vote"" + 0.019*""lecture"" + 0.017*""product"""
3,3,"0.025*""properties"" + 0.021*""supervise_learn"" + 0.020*""expectation"" + 0.020*""automate"" + 0.019*""extensible"" + 0.019*""fairmod"" + 0.018*""everyday"" + 0.017*""causal_model"" + 0.017*""deep"" + 0.017*""crowdsourcing"""
4,4,"0.024*""perspective"" + 0.023*""public"" + 0.021*""apply"" + 0.021*""behind"" + 0.021*""value"" + 0.019*""aequitas"" + 0.018*""social"" + 0.018*""aware"" + 0.018*""decision_make"" + 0.018*""transportation"""


In [None]:
df["title_clean"][1]

#### Working with Abstract Data

In [None]:
#df.head()

In [48]:
#Now doing everything we just did but for abstract data:

#First getting a document term matrix and setting a sparse matrix corpus
#cv = CountVectorizer(stop_words="english")

ab_cv = cv.fit_transform(df['abstract_clean'])

ab_dtm = pd.DataFrame(ab_cv.toarray(), columns=cv.get_feature_names())
ab_dtm.index = df.index

In [49]:
#Next creating a sparse matrix corpus
s_counts = scipy.sparse.csr_matrix(ab_dtm)
ab_corpus = matutils.Sparse2Corpus(s_counts)

In [50]:
#Creating our dictionary of tokens with ids
def preprocess(text):
   result = []
   for token in gensim.utils.simple_preprocess(text):
      
       if len(token) > 1: 
         result.append(token)
   return result

abstracts = df["abstract_clean"].map(preprocess)

ab_dictionary = gensim.corpora.Dictionary(processed)


In [51]:
#Finally instantiating LDA model and fitting to our abstract data. 
lda = gensim.models.ldamodel.LdaModel(id2word=ab_dictionary, corpus=ab_corpus, num_topics=3, passes=30)

#Putting our Title topics in a dataframe
abstract_topics = pd.DataFrame(lda.print_topics(num_words=10))
abstract_topics.columns= ["topics", "terms"]
abstract_topics

Unnamed: 0,topics,terms
0,0,"0.028*""embrace"" + 0.027*""commerce"" + 0.022*""application"" + 0.022*""allocate"" + 0.021*""responsible"" + 0.020*""service"" + 0.019*""judgement"" + 0.019*""cut"" + 0.018*""dominant"" + 0.018*""max"""
1,1,"0.031*""deep"" + 0.022*""causal_model"" + 0.021*""toolkit"" + 0.020*""welfare"" + 0.020*""public"" + 0.020*""potentially"" + 0.017*""properties"" + 0.017*""tolerant"" + 0.016*""metrics"" + 0.016*""recommendation"""
2,2,"0.034*""intersectional"" + 0.031*""critically"" + 0.029*""social"" + 0.026*""attitudes"" + 0.025*""un"" + 0.022*""analysis"" + 0.022*""nash"" + 0.022*""predictive"" + 0.022*""concentration"" + 0.020*""software"""


In [52]:
#Saving my 3 topics as a csv
#abstract_topics.to_csv("/Users/aster/Desktop/fall_2020/ML/final_project/pratt_ml_final_project/data/abstract_topics.csv")

#Saving my abstract topics model
lda.save("abstracts.model")

In [54]:
#Importing pyLDAvis libraries
import pyLDAvis
import pyLDAvis.gensim

model = models.LdaModel.load("abstracts.model")
#instantiating pyLDAvis and setting to variable
vis = pyLDAvis.gensim.prepare(topic_model= model, 
                              corpus=ab_corpus, 
                              dictionary=ab_dictionary)

pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

  and should_run_async(code)
