In [6]:
!pip install emoji



In [7]:
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('punkt')
from nltk.tokenize import TreebankWordTokenizer
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import emoji
import spacy
from operator import itemgetter
import itertools
from itertools import combinations


from nltk.stem.snowball import SnowballStemmer


from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from tqdm import tqdm
tqdm.pandas()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Stopwords and spacy model
* We extract a list of stopwords from - https://gist.github.com/sebleier/554280 because the available list of NLTK/Spacy stopwords are highly insufficient and incomplete.
* We load spaCy's English pipeline optimized for CPU (`en_core_web_sm`) and will be using it for the lemmatization task
* We also instantiate SnowballStemmer to perform stemming after lemmatization is done in later stages of the notebook

In [9]:
#### Stopwords list taken from - https://gist.github.com/sebleier/554280 (reason - the stopword lists available in NLTK / spaCy are too short and don't cover all the words we need to drop)
stopwords_list = ["0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", 
"different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", 
"isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", 
"presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", 
"thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz",]

# spaCy English pipeline; the lemmatization helpers below run texts through it.
nlp = spacy.load("en_core_web_sm")

# English Snowball stemmer; stemming_perform applies it token by token.
stemmer = SnowballStemmer("english")

## Custom preprocessing functions for Vocabulary extraction
We declare basic functions to perform tasks like removal of stopwords, stemming, lemmatization in batches, removal of emojis, etc.

In [10]:
def remove_stopwords(text, stopwords=None):
    '''
      Removes stopwords from the text.

      The text is split on whitespace, every token found in the stopword
      collection is dropped, and the remaining tokens are re-joined with
      single spaces. Matching is exact: no case folding or punctuation
      stripping happens here.

      Parameters:
        text      - the string to filter
        stopwords - optional iterable of words to drop; when omitted the
                    module-level `stopwords_list` is used

      Returns:
        the filtered text as a single string ('' when nothing survives)
    '''
    if stopwords is None:
        stopwords = stopwords_list

    # Hash-based membership: the default list holds over a thousand entries,
    # so scanning it linearly for every token is needlessly slow.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)

    return ' '.join(word for word in text.split() if word not in stopwords)

def stemming_perform(text):
    '''
      Stems every whitespace-separated token of the text with the
      module-level `stemmer` and returns the stems joined by single spaces
    '''
    return ' '.join(stemmer.stem(word) for word in text.split())


def cleaning_text(text):
    '''
      Operations performed:-
      1. Converting the entire text to lowercase
      2. Replacing punctuation (anything that is neither a word character
         nor whitespace) with a space
      3. Removal of numbers (digit runs) from the text
      4. Removal of stopwords via remove_stopwords
      5. Removal of the stand-alone token 'amp' (what is left of the HTML
         entity '&amp;' once the punctuation is gone)

      The input is a text and the output is the cleaned text with tokens
      separated by single spaces
    '''

    text = text.lower()

    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = remove_stopwords(text)

    # Drop 'amp' only when it is a whole token. A plain substring replace also
    # cuts it out of real words ('camp' -> 'c', 'champion' -> 'chion').
    return ' '.join(token for token in text.split() if token != 'amp')


def give_emoji_free_text(text):
    '''
      Removes all possible emojis from the text (because our text is basically tweets that can have emojis).
      The input is a text and the output is emoji free text.

      Every whitespace-separated token that contains at least one emoji is
      dropped as a whole; the remaining tokens are joined by single spaces.
    '''
    # emoji.emoji_count is used instead of a membership test against
    # emoji.UNICODE_EMOJI: that table was removed in emoji 2.0, so the lookup
    # breaks on a fresh, unpinned install. emoji_count also recognises emojis
    # made of several code points, which a per-character test cannot.
    return ' '.join(token for token in text.split() if emoji.emoji_count(token) == 0)


def preprocess_custom(text):
    '''
      Full cleaning of a single tweet: emoji-bearing tokens are removed first
      (give_emoji_free_text) and the result is passed through cleaning_text
    '''
    return cleaning_text(give_emoji_free_text(text))

def lemmatize_batches(docs):
    '''
      Lemmatizes an iterable of texts with the module-level spaCy pipeline `nlp`
      (parser and NER disabled, batches of 256, 3 worker processes).
      Returns a list with one string per input text: its lemmas joined by single spaces
    '''
    parsed_docs = nlp.pipe(docs, batch_size=256, n_process=3, disable=["parser", "ner"])
    return [' '.join(tok.lemma_ for tok in parsed) for parsed in parsed_docs]


def lemmatize_df(df):
    '''
      Lemmatizes df['preprocessed_text'] in slices of 1000 rows (one tqdm tick
      per slice), stores the result in a new 'lemmatize' column of the SAME
      dataframe and returns that dataframe
    '''
    texts = df['preprocessed_text']
    lemmas = []
    for start in tqdm(range(0, df.shape[0], 1000)):
        lemmas += lemmatize_batches(texts.iloc[start:start + 1000])

    df['lemmatize'] = lemmas
    return df



def return_frequency_dict_sorted(df):
    '''
      Joins the whole 'stemmed' column into one text, tokenizes it with
      word_tokenize and counts the lowercased tokens.
      Returns a list of (token, count) pairs sorted by count, highest first
    '''
    corpus = ' '.join(df['stemmed'].values)
    counts = FreqDist(token.lower() for token in word_tokenize(corpus))

    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

def replace(prob,x2,thresh1,thresh2):
    '''
      Returns 'Harmful_Traditional_practice' when prob lies strictly between
      thresh1 and thresh2. In every other case, including a NaN prob,
      the existing label x2 is returned unchanged
    '''
    # NaN marks rows without a model score; their label is left untouched
    if str(prob) == 'nan':
        return x2

    return 'Harmful_Traditional_practice' if thresh1 < prob < thresh2 else x2


## Vocab extraction and Preprocessing Stage
We will preprocess the text in two stages
1. Here we will consider only the 'Harmful_traditional practice' class and find the vocabulary corresponding to it
2. Next we will consider the class 'sexual_violence' and find the vocabulary corresponding to it.
3. Our combined vocabulary will be the vocabulary from both the stages

####  Our input data will be the Data from previous stage

## Text Preprocessing for Vocab extraction for class 'Harmful Traditional Practices'
We preprocess the test data tweets and then find the frequency of occurrence of the preprocessed words in the dataset. We then select the top 5 words (`threshold2consider`) from the test data tweets, which will be used as the vocabulary for building our model. We consider only the class 'Harmful_Traditional_practice'.

In [12]:
# Labels from the previous stage, restricted to the Harmful_Traditional_practice rows
subdf = pd.read_csv('/content/gdrive/MyDrive/Hackathon/outcome_stage3.csv')
subdf = subdf[subdf['type'].eq('Harmful_Traditional_practice')]
print(f'The shape of subsetted data with subsetting on Harmful_Traditional_practice will be {subdf.shape}')

test_df = pd.read_csv('/content/gdrive/MyDrive/Hackathon/Test.csv')

# Attach the test-set columns to each selected row; the label column becomes 'type1'
temp_df = (
    subdf.merge(test_df, on=['Tweet_ID'], how='left')
         .rename(columns={'type': 'type1'})
)
print(f'The shape of data after assigning the labels attained from BERT to test is {temp_df.shape}')

# clean -> lemmatize -> stem, then count the stemmed tokens
temp_df['preprocessed_text'] = temp_df['tweet'].progress_apply(preprocess_custom)
temp_df = lemmatize_df(temp_df)
temp_df['stemmed'] = temp_df['lemmatize'].progress_apply(stemming_perform)
sorted_freq_dict = return_frequency_dict_sorted(temp_df)

# The vocabulary is the `threshold2consider` most frequent stemmed tokens
threshold2consider = 5
vocab_htp = [token for token, _count in sorted_freq_dict[:threshold2consider]]


The shape of subsetted data with subsetting on Harmful_Traditional_practice will be (3158, 2)
The shape of data after assigning the labels attained from BERT to test is (3158, 3)


100%|██████████| 3158/3158 [00:01<00:00, 1606.65it/s]
100%|██████████| 4/4 [00:06<00:00,  1.66s/it]
100%|██████████| 3158/3158 [00:00<00:00, 3512.10it/s]


In [13]:
print('The top 10 words (preprocessed) based on the frequency distribution and their relative frequencies are given below')
pd.DataFrame(sorted_freq_dict,columns = ['word','Frequency']).head(10)

The top 10 words (preprocessed) based on the frequency distribution and their relative frequencies are given below


Unnamed: 0,word,Frequency
0,forc,3374
1,fgm,2861
2,marriag,2415
3,woman,1513
4,child,1192
5,girl,1187
6,femal,593
7,rape,522
8,sex,485
9,violenc,465


# Classification model on the vocabulary attained
We will now preprocess our train data and the tweets that BERT classified as Physical_violence or Harmful_Traditional_practice, so that these labels can be fine-tuned. <br>
Our test data was already preprocessed in the previous step; we will now preprocess our train dataset.

## Text Preprocessing on Train data
We subset our train data having only Physical Violence and Harmful Traditional Practices labels and perform text preprocessing on train and test data

In [15]:
test_df = pd.read_csv('/content/gdrive/MyDrive/Hackathon/Test.csv')
subdf = pd.read_csv('/content/gdrive/MyDrive/Hackathon/outcome_stage3.csv')

# Every test tweet gets its label from the previous stage
test_df = test_df.merge(subdf, on=['Tweet_ID'], how='left')

# Only these two classes are re-scored by the model fitted below
target_classes = ['Harmful_Traditional_practice', 'Physical_violence']
subdf = subdf[subdf['type'].isin(target_classes)]

# NOTE(review): test_df already received a `type` column from the merge above, so
# this second merge presumably yields suffixed label columns and the rename then
# matches nothing. Confirm against temp_df.columns.
temp_df = (
    subdf.merge(test_df, on=['Tweet_ID'], how='left')
         .rename(columns={'type': 'type1'})
)

# Test side: clean -> lemmatize -> stem
temp_df['preprocessed_text'] = temp_df['tweet'].progress_apply(preprocess_custom)
temp_df = lemmatize_df(temp_df)
temp_df['stemmed'] = temp_df['lemmatize'].progress_apply(stemming_perform)

# Train side: same two classes, same three preprocessing steps
train_df = pd.read_csv('/content/gdrive/MyDrive/Hackathon/Train.csv')
train_data = train_df[train_df['type'].isin(target_classes)].reset_index()

train_data['preprocessed_text'] = train_data['tweet'].progress_apply(preprocess_custom)
train_data = lemmatize_df(train_data)
train_data['stemmed'] = train_data['lemmatize'].progress_apply(stemming_perform)

print(test_df.shape,train_df.shape,temp_df.shape)

100%|██████████| 6536/6536 [00:03<00:00, 1890.06it/s]
100%|██████████| 7/7 [00:07<00:00,  1.08s/it]
100%|██████████| 6536/6536 [00:01<00:00, 4373.26it/s]
100%|██████████| 6134/6134 [00:02<00:00, 2621.82it/s]
100%|██████████| 7/7 [00:06<00:00,  1.12it/s]
100%|██████████| 6134/6134 [00:00<00:00, 6750.89it/s]

(15581, 3) (39650, 3) (6536, 7)





## Count vectorization with binarization
We apply count vectorizer on the train and test datasets 

In [16]:
# Down-sample Physical_violence to 500 rows and keep every other row as is.
# - random_state pins the draw, so a re-run trains on the same sample.
# - pd.concat replaces DataFrame.append, which pandas 2.0 removed;
#   ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
is_physical = train_data['type'] == 'Physical_violence'
sample = pd.concat(
    [train_data[is_physical].sample(500, random_state=13), train_data[~is_physical]],
    ignore_index=True,
)
# sample = train_data.copy()

In [17]:
train_X = sample['stemmed'].values
test_X = temp_df['stemmed'].values

# Fixed vocabulary: one column per word of vocab_htp, in that order.
# binary=True records presence/absence of a word instead of its count.
cv = CountVectorizer(vocabulary=vocab_htp, binary=True)

train = cv.fit_transform(train_X)
test = cv.transform(test_X)

#### Converting the sparse matrices to dense
# toarray() returns plain ndarrays. todense() returns np.matrix, a legacy type
# that NumPy discourages and that newer ML tooling may refuse.
train = train.toarray()
test = test.toarray()
print(f'The shape of preprocessed train data matrix is {train.shape} and preprocessed test data matrix is {test.shape}')

### label encoding done here
sample['labels'] = sample['type'].map({'Physical_violence':0,'Harmful_Traditional_practice':1})

train_y = sample['labels'].values

The shape of preprocessed train data matrix is (688, 5) and preprocessed test data matrix is (6536, 5)


## Fitting a model on preprocessed data and finetuning
* We fit an XGBClassifier model on the preprocessed data 
* We will now update the previous labels of the tweets classified as Physical_violence or Harmful_Traditional_practice with the outcomes attained from the model 
* The outcomes will be used for second stage classification

In [18]:
sample['labels'].value_counts()

0    500
1    188
Name: labels, dtype: int64

In [19]:
from xgboost import XGBClassifier

# Class counts are read from the labels actually used for training, so the
# weight stays correct if the sampling above changes (it equals the previous
# hard-coded sqrt(500/188) for the 500/188 split shown in this notebook).
n_negative = int((train_y == 0).sum())
n_positive = int((train_y == 1).sum())

model = XGBClassifier(
    n_estimators=40,
    random_state=13,
    max_depth=2,
    scale_pos_weight=np.sqrt(n_negative / n_positive),
)
_ = model.fit(train, train_y)

In [20]:

# Second predict_proba column = probability of label 1, which the label
# encoding maps to Harmful_Traditional_practice
htp_probability = model.predict_proba(test)[:, 1]
temp_df['prob'] = htp_probability

# Left merge keeps every test tweet; tweets that were not scored get NaN in 'prob'
findf = test_df.merge(temp_df, how='left', on=['Tweet_ID', 'tweet'])


In [21]:
### We update the previous classes with the updated ones 
# A label is switched only when the model probability lies strictly inside
# (0.95, 0.99); rows without a probability keep their label (see replace)
findf['fintype'] = [
    replace(prob, label, 0.95, 0.99)
    for prob, label in zip(findf['prob'], findf['type'])
]
# flag = 1 where this stage changed the label
findf['flag'] = findf['type'].ne(findf['fintype']).astype(int)

print(f'Ensuring the shape of outcome dataframe is maintained, we have shape of output data as - {findf.shape}')

Ensuring the shape of outcome dataframe is maintained, we have shape of output data as - (15581, 11)


In [22]:
findf['fintype'].value_counts()

sexual_violence                 8101
Physical_violence               3360
Harmful_Traditional_practice    3176
economic_violence                498
emotional_violence               446
Name: fintype, dtype: int64

## Reviewing some modifications 
We will review some of the labels assigned by the BERT model that were modified in this stage of supervised, machine-learning-based classification 

In [23]:
# Rows whose label was changed in this stage, lowest model probability first
mismatch = (
    findf.loc[findf['flag'] == 1, ['Tweet_ID', 'tweet', 'type', 'fintype', 'prob']]
         .sort_values(by=['prob'])
)
mismatch.head(20)

Unnamed: 0,Tweet_ID,tweet,type,fintype,prob
4779,ID_LHF3OKZS,"Welcome to the pearl of Africa, where a adult ...",Physical_violence,Harmful_Traditional_practice,0.958514
145,ID_NVNOM0GA,Halftime stats: WLU: -Six three-pointers made...,Physical_violence,Harmful_Traditional_practice,0.978076
13846,ID_677MK7CR,Halftime Stats: D&amp;E: - Shooting 42% from t...,Physical_violence,Harmful_Traditional_practice,0.978076
13681,ID_UQOQMCRH,Houston with a dominant 1H doubling up the Bea...,Physical_violence,Harmful_Traditional_practice,0.978076
12834,ID_LDY2322H,"Draymond Green had ZERO FGM, but his defensive...",Physical_violence,Harmful_Traditional_practice,0.978076
12501,ID_MSFPYJKH,Highlights of Yi Jianlian on Guangdong's winni...,Physical_violence,Harmful_Traditional_practice,0.978076
11947,ID_4IMPJKVT,Here's what Barret Peery did in his FIRST year...,Physical_violence,Harmful_Traditional_practice,0.978076
11437,ID_SU11V6HP,David Nwaba ranks in the 88th percentile as an...,Physical_violence,Harmful_Traditional_practice,0.978076
6541,ID_REQW5FGD,Tonight is the lowest-scoring first half of th...,Physical_violence,Harmful_Traditional_practice,0.978076
5636,ID_PXZ033U0,forced to rely on cuts and split action plays ...,Physical_violence,Harmful_Traditional_practice,0.978076


In [24]:
# mismatch.to_csv('check5.csv',index=False)

In [25]:
# Output: tweet id plus the updated label, exposed under the column name 'type'
final_out = findf[['Tweet_ID', 'fintype']].rename(columns={'fintype': 'type'})
final_out.head(3)

Unnamed: 0,Tweet_ID,type
0,ID_D9ONL553,sexual_violence
1,ID_263YTILY,emotional_violence
2,ID_62VS6IXC,emotional_violence


In [26]:
final_out['type'].value_counts()

sexual_violence                 8101
Physical_violence               3360
Harmful_Traditional_practice    3176
economic_violence                498
emotional_violence               446
Name: type, dtype: int64

In [28]:
final_out.to_csv('outcome_stage4.csv',index=False)

In [29]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
