In [82]:
library(tidyverse)
library(magrittr)
library(text2vec)
library(tokenizers)
library(doParallel)
library(tm)
library(caret)
library(stringr)
registerDoParallel(4)

In [4]:
opinions <- read.csv("opinions_sample.csv", header=TRUE, sep=";")

In [5]:
print(dim(opinions))
print(head(opinions))
print(names(opinions))

[1] 5000   17
                                 absolute_url
1    /opinion/143119/berwick-v-united-states/
2    /opinion/122028/morales-v-united-states/
3               /opinion/92451/calton-v-utah/
4      /opinion/134300/samson-v-lewis-warden/
5 /opinion/145692/wachovia-bank-na-v-schmidt/
6              /opinion/89793/hurley-v-jones/
                                                 author author_str
1                                                               NA
2                                                               NA
3 http://www.courtlistener.com/api/rest/v3/people/1366/         NA
4                                                               NA
5 http://www.courtlistener.com/api/rest/v3/people/1213/         NA
6 http://www.courtlistener.com/api/rest/v3/people/3338/         NA
                                                    cluster
1 http://www.courtlistener.com/api/rest/v3/clusters/143119/
2 http://www.courtlistener.com/api/rest/v3/clusters/122028/
3  http://www.c

 [1] "absolute_url"        "author"              "author_str"         
 [4] "cluster"             "date_created"        "date_modified"      
 [7] "download_url"        "extracted_by_ocr"    "html"               
[10] "html_lawbox"         "html_with_citations" "joined_by"          
[13] "local_path"          "opinions_cited"      "per_curiam"         
[16] "plain_text"          "resource_uri"       


In [103]:
#Data cleaning pipeline
clean_tokens <- opinions %$%
  str_to_lower(plain_text) %>%
  str_replace_all("[^[:alpha:]]", " ") %>%  #Removing numbers and symbols from the text
  str_replace_all("\\s+", " ") %>%
  gsub("\\b\\w{1}\\s","", .) %>% #Remove words with lengh less than 2
  itoken(tokenizer = tokenize_word_stems)

In [104]:
#TODO -- Create a stopword list which appear to be useless
#Voc pipeline
vocab <- create_vocabulary(clean_tokens, stopwords = stopwords("en")) %>%
  prune_vocabulary(term_count_min = 3, term_count_max = 5000, doc_proportion_max = 0.7, vocab_term_max = 20000)
vectorize <- vocab_vectorizer(vocab)

In [105]:
summary(vocab)
print(vocab$term[1:40])

     term             term_count        doc_count    
 Length:8950        Min.   :   3.00   Min.   :  1.0  
 Class :character   1st Qu.:   5.00   1st Qu.:  3.0  
 Mode  :character   Median :  11.00   Median :  5.0  
                    Mean   :  66.83   Mean   : 14.8  
                    3rd Qu.:  40.00   3rd Qu.: 14.0  
                    Max.   :4086.00   Max.   :148.0  

 [1] "unit"      "case"      "opinion"   "law"       "feder"     "ani"      
 [7] "id"        "act"       "claim"     "may"       "rule"      "requir"   
[13] "govern"    "cite"      "congress"  "statut"    "dissent"   "district" 
[19] "becaus"    "one"       "doe"       "onli"      "use"       "appeal"   
[25] "also"      "reason"    "whether"   "petition"  "public"    "right"    
[31] "appli"     "evid"      "judgment"  "constitut" "provid"    "order"    
[37] "defend"    "must"      "decis"     "tion"     


In [106]:
tfidf_model <- TfIdf$new(smooth_idf = TRUE, norm = c('l2'), sublinear_tf = FALSE) #Using L2 lasso descent gradiant

In [107]:
dtm <- create_dtm(clean_tokens, vectorizer)
#i equal the index of the term in the vectorizer
#j equal the index of the document
#x equal occurence of that term

In [108]:
tfidf_dtm <- tfidf_model$fit_transform(dtm)
#tfidf weight of the previous DTM

In [109]:
print(dim(tfidf_dtm))

[1] 5000 8978
