<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Libraries" data-toc-modified-id="Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Libraries</a></span></li><li><span><a href="#Data-Preprocessing" data-toc-modified-id="Data-Preprocessing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Preprocessing</a></span></li><li><span><a href="#Extracting-tweets-from-SQL-database" data-toc-modified-id="Extracting-tweets-from-SQL-database-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Extracting tweets from SQL database</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Word2vec" data-toc-modified-id="Word2vec-3.0.1"><span class="toc-item-num">3.0.1&nbsp;&nbsp;</span>Word2vec</a></span></li></ul></li></ul></li><li><span><a href="#Topic-Modeling" data-toc-modified-id="Topic-Modeling-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Topic Modeling</a></span></li></ul></div>

# Libraries 

    The following Libraries were used in the development of this project:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy
import nltk
import pyodbc
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from string import punctuation
import collections
from collections import Counter
import json 

import nltk 


'''Natural Language Processing libraries'''
import nltk 
import gensim
import regex as re
import spacy
from spacy.lang.en import English
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import gensim.downloader as api
import re, string, unicodedata
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize
from gensim.models import FastText
from gensim.models import Word2Vec

import preprocessor as p

import warnings
warnings.filterwarnings("ignore")

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


# Plotting tools

import matplotlib.pyplot as plt

nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')




[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\afabi/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\afabi/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\afabi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\afabi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\afabi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Data Preprocessing

The data preprocessing consist of different stages:

In [2]:
class Document:
    """ Retrieve the narratives from the DataFrame and respectively
        store and pre-process it. 
        
        :param df: DataFrame including the reports and the predictor variable. 
        
        
        :ivar data: Stores the DataFrame.
        :ivar text: Stores the narratives as string.
        :ivar corpus: Stores the pre-processed text.
    """
    
    
    def __init__(self, df):
        self.data = df
        self.text = df["text"].astype(str)
        self.textPreProcessing()
        
        
    def remove_non_ascii(self, words):
        """Remove non-ASCII characters from list of tokenized words
        
        :param words:  List of words to be transformed when removing non_ascii characters.
        
        :return new_words: List of words after the transformation of removed non_ascii characters.
        
        """
        new_words = []
        for word in words:
            new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            new_words.append(new_word)
        return new_words


    def remove_punctuation(self, words):
        """Remove punctuation from list of tokenized words
        
        :param words:  List of words that will get remove their punctuations, if any. 
        
        :return new_words: List of transformed words.
        
        
        """
        new_words = []
        for word in words:
            new_word = re.sub(r'[^\w\s]', '', word)
            if new_word != '':
                new_words.append(new_word)
        return new_words


    def stem_words(self, words):
        """Stem words in list of tokenized words
        
        :param words:  List of words to be processed. 
        
        :return new_words: List of the received words respective stems.
        
        
        """
        
        stemmer = LancasterStemmer()
        stems = []
        for word in words:
            stem = stemmer.stem(word)
            stems.append(stem)
        return stems
    
    def lemmatize_verbs(self, words):
        """Lemmatize verbs in list of tokenized words
        
        :param words:  List of words to be processed. 
        
        :return new_words: List of the received words respective lemmas.
        
        """
        
        lemmatizer = WordNetLemmatizer()
        lemmas = []
        for word in words:
            lemma = lemmatizer.lemmatize(word, pos='v')
            lemmas.append(lemma)
        return lemmas
    
    
    
    def remove_stopwords(self, words):
        """Remove common words that have no meaning or importance in the sentence.

        :param words:  List of words to be processed and get stop words removed.. 

        :return new_words: List of words with the stop words already removed."""
            
        
        stop_words = set(stopwords.words('spanish')) 
        stop_words1 = set(stopwords.words('english')) 


        
        for word in stop_words:
            if word in words:
                words.remove(word)
                
        for word in stop_words1:
            if word in words:
                words.remove(word)
                
        return words


    
    def normalize(self, words):
        words = self.remove_non_ascii(words)
        words = self.remove_stopwords(words)
        words = self.remove_punctuation(words)
        words = self.lemmatize_verbs(words)
        return words
    
    
    def textPreProcessing(self):
        """Pre-process the text, normalize and clean it.
        The function stores the cleaned text in the self.data
        attribute. """
        p.set_options(p.OPT.URL,p.OPT.MENTION,p.OPT.HASHTAG,p.OPT.RESERVED,p.OPT.SMILEY,p.OPT.NUMBER)


        clean_text = []

        for narrative in self.text:
            sentence = p.clean(narrative)
            #sentence = re.sub('RT @[\w_]+:', '', sentence)

            #sentence = re.sub(r'https?:\/\/.*[\r\n]*', '', sentence)
            sentence = word_tokenize(sentence)
            sentence = self.normalize(sentence)
                
                
            clean_text.append(sentence)
            
            
                    
        print(len(self.text), len(clean_text))
        self.data["clean_text"] = clean_text

In [3]:
def concatTweets(table):
    tweets = ''
    c = 0
    for row in df.tweets:
        if(c == 0):
            c+= 1
            tweets = pd.DataFrame(json.loads(row))
        else:
            tweets = pd.concat([tweets, pd.DataFrame(json.loads(row))], axis=0, ignore_index=True)
    
    
    return tweets 

In [40]:
import google.cloud 
from google.cloud import language_v1
from google.cloud.language_v1 import types


client = language_v1.LanguageServiceClient.from_service_account_json("C:\\Users\\afabi\\Downloads\\service-account.json")

def sentimentAnalysis(text, sentences):        
    document = language_v1.Document(content=text, type_=language_v1.Document.Type.HTML)

    annotations = client.analyze_sentiment(document=document,
                                       encoding_type='UTF32', timeout=600)


    for s in annotations.sentences:
        sentences.append((s.text.content, s.sentiment.score, s.sentiment.magnitude))
        
        
def concatCleanedText(cleaned_text, sentences):
    print("a")
    for tweet in cleaned_text:
        tuit = ''
        for word in tweet:
            tuit = tuit + ' ' + str(word)
        
        
        sentimentAnalysis(tuit, sentences)
        
    


In [41]:
def userSubSets(tweets_df):
    user = ''
    sentences = []   

    for instance in tweets_df.user:
        try:
            if(user != instance):
                print(instance)
                user = instance
                actual_user =  tweets_df['user']== user
                user_subset = tweets_df[actual_user]

                concatCleanedText(user_subset.clean_text, sentences)
            else:
                continue
        except:
            continue
            
            
            
    return sentences
            

            

def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)




def get_scores(group,tweets):
    scores = []
    for tweet in tweets:
        s = jaccard_similarity(group, tweet)
        scores.append(s)
    return scores


    
    
            
        
        
    

# Extracting tweets from SQL database

In [5]:
server = 'sqldatamining.database.windows.net'
database = 'SNA'
username = 'UserAdmin'
password = 'Machomen123'   
driver= '{ODBC Driver 13 for SQL Server}'

cnxn = pyodbc.connect('DRIVER={ODBC Driver 13 for SQL Server};SERVER=sqldatamining.database.windows.net,1433', user='UserAdmin' , password='Machomen123', database='SNA')

In [7]:
cursor = cnxn.cursor()

In [8]:
rows = cursor.execute("SELECT author, created_at, location, description, followers, following, favourites_count, statuses_count, tweets from dbo.Users1 ").fetchall()

In [16]:
df = pd.DataFrame((tuple(t) for t in rows), columns=['author',
 'created_at',
 'location',
 'description',
 'followers',
 'following',
 'favourites_count',
 'statuses_count',
 'tweets'] )


In [44]:
df.head(40)

Unnamed: 0,index,author,created_at,location,description,followers,following,favourites_count,statuses_count,tweets
0,0,themoonisironic,2014-01-16 01:10:33,‘97 // they/them // ????????,"where there is hope, there are trials // art: ...",101,516,121166,36377,"[{""id"": 1356030533780824069, ""created_at"": ""20..."
1,1,bbelita23,2012-09-19 23:21:48,"spooky town, PR",just put it out into the universe,670,443,33428,92917,"[{""id"": 1356026926134140928, ""created_at"": ""20..."
2,3,CoraimaINegron,2013-08-20 10:48:59,Puerto Rico,"“El karma te lo devolverá todo, excepto a la m...",64,203,1336,1565,"[{""id"": 1283388114661244934, ""created_at"": ""20..."
4,8,SJCiudadCapital,2021-01-06 05:23:04,,Cuenta Oficial de la Ciudad Capital de Puerto ...,2915,183,291,1157,"[{""id"": 1355954464436473856, ""created_at"": ""20..."
5,9,Angelicv_G,2012-03-12 01:12:16,Puerto Rico,UT • yo no sé que poner aquí??,962,519,115665,130883,"[{""id"": 1356041673227313154, ""created_at"": ""20..."
6,12,__lfc7,2013-06-22 22:47:49,,UPRRP | ????| Metas claras ??,1194,758,9114,47354,"[{""id"": 1355634630569975814, ""created_at"": ""20..."
7,14,Enrique94494039,2020-02-14 15:04:19,,,1,57,229,177,"[{""id"": 1329582757916315648, ""created_at"": ""20..."
8,15,karywasabi13,2013-08-04 23:50:38,"Río Grande, Puerto Rico",801 | ig:karytza13 | Patria Nueva,5216,512,34401,52523,"[{""id"": 1356032271418085376, ""created_at"": ""20..."
9,16,ArekCell,2020-06-20 11:30:13,"Kota Madiun, Jawa Timur",Pejuang rupiah,74,930,242,262,"[{""id"": 1348920054830616582, ""created_at"": ""20..."
10,18,_sarielys_,2019-01-11 14:31:36,Puerto Rico,Si el corazón se aburre de querer para qué sirve,95,102,16430,10190,"[{""id"": 1355685064361848836, ""created_at"": ""20..."


In [18]:
index = 0
for tweets in df.tweets:
    print(len(tweets))
    if(len(tweets) == 2):
        df.drop([index], inplace=True, axis=0)
    
    index+=1
    
df.reset_index(inplace=True)

index = 0
temp_user = ""
for user in df.author:
    if(user != temp_user):
        print("a")
        temp_user = user 
    else:
        df.drop([index], inplace=True, axis=0)
    
    
    index+=1

435940
181519
2
34422
2
34422
2
2
68260
331733
2
2
826569
2
476
617388
60177
2
725156
763868
731587
494057
2
2
12645
498511
398221
2
542318
325
325
325
702299
685193
2
2
2
140497
261657
685293
724710
633249
292692
189022
77520
524238
555235
15128
554305
578829
160924
478081
65397
620540
822004
2
370254
2
732823
401562
697036
547906
573979
284433
648844
2
68923
775001
562105
2
108820
1634
1634
1634
491073
2
2
649389
452687
582555
411947
628868
438351
666529
167025
453475
2
83318
2
2
2
2
551799
2
492125
228226
2
425520
212765
2
2
203271
296108
2
551471
16490
580088
111103
180565
46175
30687
30687
420413
267395
17286
428195
697346
17815
17815
105090
660701
205762
176973
45702
45702
404359
401769
550530
2
335336
262883
577306
493072
523520
231127
404185
2
766582
126327
792930
303486
540447
446360
2
259352
672275
20625
2
2
490045
4164
2332
98061
216124
311091
5173
2
5173
2
199754
655173
414660
76320
491287
420540
34425
34425
2
668350
2
445849
476
13812
23860
10785
618903
2
711757
2
57035
2


In [20]:
df.tail()

Unnamed: 0,index,author,created_at,location,description,followers,following,favourites_count,statuses_count,tweets
437,601,KiddKayio,2018-04-06 04:45:04,,https://t.co/ZKbsctNBQU\nHago musiquita??,171,303,20702,7727,"[{""id"": 1356004751125721092, ""created_at"": ""20..."
438,605,EduardoLamadrid,2010-07-23 01:11:08,"Caguas, Puerto Rico",,222,399,1009,533,"[{""id"": 1354880270114181120, ""created_at"": ""20..."
440,607,DiBUJO_TV,2016-07-26 05:03:26,"Aibonito, PR","Especialista en artes de Video Juegos, Anime o...",47,178,2302,1368,"[{""id"": 1356022122716532738, ""created_at"": ""20..."
441,608,_Adriari_,2013-06-21 05:07:24,"Vega Baja, Puerto Rico",,645,498,3227,45752,"[{""id"": 1356040793824374784, ""created_at"": ""20..."
442,609,Rosynelle,2011-08-18 16:50:33,"Rio Grande, PR","Creo en Dios, sus milagros, portentos y mara...",62,795,18214,7941,"[{""id"": 1355848250360815616, ""created_at"": ""20..."


In [22]:
x = concatTweets(df)


x.tail()

Unnamed: 0,id,created_at,text,coordinates,geo,user,retweet_count,favorite_count
572213,1202372787555442691,2019-12-04T23:43:06,@Lis_Milland Hizo cruzadas de evangelismo y ll...,,,Rosynelle,0,0
572214,1202372264769081344,2019-12-04T23:41:01,RT @drsamuelpagan: Una muy buena noticia: Hoy ...,,,Rosynelle,215,0
572215,1201678454128873472,2019-12-03T01:44:04,RT @notiseis360pr: Pronta recuperación al exma...,,,Rosynelle,2,0
572216,1201656614908022784,2019-12-03T00:17:17,RT @drcarlosmellado: Hoy estuve con Liliam y J...,,,Rosynelle,301,0
572217,1201653951009099776,2019-12-03T00:06:42,RT @Lis_Milland: Gracias Amos Diaz por invitar...,,,Rosynelle,1,0


### Word2vec

In [356]:
model = Word2Vec(documento.data["clean_text"], min_count=0, workers=20, window=2,  alpha=0.02, hs=1)



In [503]:
my_dict = dict({})
for idx, key in enumerate(model.wv.key_to_index):
    my_dict[key] = model.wv[key]

In [504]:
vectores = []
for a in documento.data.clean_text:
    t  = []
    for word in a:
        try:
            t.append(my_dict[word])
        except:
            continue
    
    vectores.append(t)
        
documento.data["vectorized_text"] = vectores
documento.data.tail()

Unnamed: 0,text,clean_text,vectorized_text
113,": El ajusta la magnitud a , en el límite de pr...","[El, ajusta, magnitud, limite, producir, tsunami]","[[-0.017272485, 0.008652826, 0.007515971, 0.00..."
114,: “ Van a seguir ocurriendo eventos fuertes. H...,"[Van, seguir, ocurriendo, eventos, fuertes, Ha...","[[-0.0027897793, -0.0016228552, -0.007687815, ..."
115,: Casa colapsada en Yauco.,"[Casa, colapsada, Yauco]","[[0.008535036, 0.013252284, 0.00035941493, 0.0..."
116,: “Las réplicas podrían ser de magnitud mayor ...,"[Las, replicas, podrian, ser, magnitud, mayor,...","[[0.0029013832, -0.0027108788, -0.0015446608, ..."
117,": PRELIMINAR -01-06 :58:03 No hay Aviso, Adver...","[PRELIMINAR, 06, 5803, No, Aviso, Advertencia,...","[[-0.0127012255, 0.0040803654, 0.009027934, -0..."


In [505]:
df.tail()

Unnamed: 0,id,author,created_at,location,description,verified,followers,following,favourites_count,statuses_count,lang,tweets,following_json,followers_json
5,1685491040,CoraimaINegron,2013-08-20 10:48:59,Puerto Rico,"“El karma te lo devolverá todo, excepto a la m...",False,64,203,1336,1565,,"[{""id"": 1283388114661244934, ""created_at"": ""20...","[{""id"": 299932350, ""author"": ""DMcIntyreWWE"", ""...","[{""id"": 713749428935462916, ""author"": ""DavidRo..."
6,1388202983964520449,Personn34091581,2021-04-30 18:46:05,Costa brava,,False,0,2,9,7,,[],"[{""id"": 560803492, ""author"": ""UrbanLePharaon"",...",[]
7,908687401123553280,rsantanafonseca,2017-09-15 13:42:13,"COPU, UPRRP","19 (+3) | ?????? | Pop culture enthusiast, soc...",False,891,620,121676,124488,,[],"[{""id"": 755882683, ""author"": ""_vidalysrms"", ""c...","[{""id"": 1344047685716815872, ""author"": ""LeQuee..."
8,1346688657365889024,SJCiudadCapital,2021-01-06 05:23:04,,Cuenta Oficial de la Ciudad Capital de Puerto ...,False,2915,183,291,1157,,"[{""id"": 1355954464436473856, ""created_at"": ""20...","[{""id"": 1309256724830969857, ""author"": ""estefa...","[{""id"": 1271152649304518657, ""author"": ""JCruz_..."
9,521836439,Angelicv_G,2012-03-12 01:12:16,Puerto Rico,UT • yo no sé que poner aquí??,False,962,519,115665,130883,,"[{""id"": 1356041673227313154, ""created_at"": ""20...","[{""id"": 1192074658994180096, ""author"": ""Nsnili...","[{""id"": 1383464766283280389, ""author"": ""geegee..."


# Topic Modeling 

In [24]:
tweets = Document(x)

covid = '''covid pandemia coronavirus vacuna cuarentena virus mascara propagacion precaucion covid19 ICU hospital casos sospechoso probable vacunacion pfizer moderna jonhson camillas hospitales probabes serologica '''
politica = '''gobierno politica ppd pnp pip independentista popular wanda pierluisi gobernador senado camara legislador proyecto senadores legisladores proyecto bono representante ley veto departamento educacion salud hacienda  '''
emociones = '''depresion ansiedad felicidad emocion salud mental tristesa triste alegre contento ansioso contento deprimido anxiety '''
eventos = '''terremoto tsunami huracan lluvia calor tormenta terremotos magnitud temblores guanica ponce mayaguez estructuras grietas  '''

string1 = covid
words = string1.split()
covid = " ".join(sorted(set(words), key=words.index))

string1 = politica
words = string1.split()
politica = " ".join(sorted(set(words), key=words.index))

string1 = emociones
words = string1.split()
emociones = " ".join(sorted(set(words), key=words.index))

string1 = eventos
words = string1.split()
eventos = " ".join(sorted(set(words), key=words.index))


covid_scores = get_scores(covid, tweets.data.clean_text)
politica_scores = get_scores(politica, tweets.data.clean_text)
emociones_scores = get_scores(emociones, tweets.data.clean_text)
eventos_scores = get_scores(eventos, tweets.data.clean_text)


572218 572218


In [25]:
# create a jaccard scored df.
data  = {'names':x.user.to_list(),       'covid_scores':covid_scores,
         'politica_scores': politica_scores, 'emociones_scores':emociones_scores, 'eventos_scores':eventos_scores}
scores_df = pd.DataFrame(data)
#assign classes based on highest score
def get_classes(l1, l2, l3, l4):
    econ = []
    socio = []
    cul = []
    heal = []
    for i, j, k, l in zip(l1, l2, l3, l4):
        m = max(i, j, k, l)
        if m == i:
            econ.append(1)
        else:
            econ.append(0)
        if m == j:
            socio.append(1)
        else:
            socio.append(0)        
        if m == k:
            cul.append(1)
        else:
            cul.append(0)  
        if m == l:
            heal.append(1)
        else:
            heal.append(0)   
            
    return econ, socio, cul, heal

print(scores_df.head())
l1 = scores_df.covid_scores.to_list()
l2 = scores_df.politica_scores.to_list()
l3 = scores_df.emociones_scores.to_list()
l4 = scores_df.eventos_scores.to_list()
econ, socio, cul, heal = get_classes(l1, l2, l3, l4)

data = {'name': scores_df.names.to_list(), 'economic':econ, 'social':socio, 'culture':cul, 'health': heal}
class_df = pd.DataFrame(data)
#grouping the tweets by username
new_groups_df = class_df.groupby(['name']).sum()
#add a new totals column
new_groups_df['total'] = new_groups_df['health'] + new_groups_df['culture'] + new_groups_df['social'] +  new_groups_df['economic']
#add a new totals row
new_groups_df.loc["Total"] = new_groups_df.sum()  

             names  covid_scores  politica_scores  emociones_scores  \
0  themoonisironic      0.033333              0.0               0.0   
1  themoonisironic      0.000000              0.0               0.0   
2  themoonisironic      0.000000              0.0               0.0   
3  themoonisironic      0.000000              0.0               0.0   
4  themoonisironic      0.000000              0.0               0.0   

   eventos_scores  
0             0.0  
1             0.0  
2             0.0  
3             0.0  
4             0.0  


In [26]:
new_groups_df.head()

Unnamed: 0_level_0,economic,social,culture,health,total
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0913Y00N,295,270,309,268,1142
1yukiyu,2627,2572,2705,2572,10476
23jamel,2705,2709,2739,2709,10862
61_Shasta,1944,1160,1511,1153,5768
8Susiecolon,2116,2113,2274,2113,8616


In [None]:
sentences = userSubSets(tweets.data) 
sentiment_df = pd.DataFrame(sentences, columns=['sentence', 'score', 'magnitude'])

themoonisironic
a
bbelita23
a
CoraimaINegron
a
SJCiudadCapital
a
Angelicv_G
a
__lfc7
a
Enrique94494039
a
karywasabi13
a
ArekCell
a
_sarielys_
a
icarusio
a
KeylimarBirriel
a
valeilu4
a
teacup_talk
a
pelromaine
a
MColonCruz
a
PaganShay
a
Badillo83
a
lapatronski
a
koberayou
a
IZayasI
a
nnavm_og
a
23jamel
a
keyshla_oquendo
a
Gloriozah
a
daniellyuh
a
redirewolf
a
MTeresaVelez
a
JorgeCardonaPR
a
lucassebamolina
a
Laurel_landrau
a
astronomopr
a
domenech_eileen
a
bugattimafalda
a
ElDonCarlangas
a
EdwinSuarezVaz1
a
kaleighhhhhh13
a
josephanavin
a
gardielysss16
a
MePuedesDecirKi
a
_rbli
a
begaytrash
a
colettephair
a
bcrystal_
a
Osman_PM
a
jotabarberrr
a
IOtaku15
a
notasiahh_
a
ivancupey
a
Albeeerlg
a
Meg_the_human5
a
IrizarryPabon
a
Janelly15_
a
RicardoEladio
a
cristiina_moura
a
CottoGlendaliz
a
gail767
a
MarilynCalo
a
bIuenoise
a
ASHtroswrld
a
baezbaezzz_
a
Esoj_DaGod
a
Nancy96630137
a
corvlito
a
advenie2
a
angelecintron
a
zuheymarie_
a
neery___
a
Yianizaret3
a
yariiany1
a
VictoriaLagun20
a
nat

In [None]:
sentiment_df.head()

In [101]:
dir(language_v1.Document.Type)

['HTML',
 'PLAIN_TEXT',
 'TYPE_UNSPECIFIED',
 '__class__',
 '__doc__',
 '__members__',
 '__module__']