Part 1: Using word2vec for vectorisation of questions corpus

In [None]:
!pip install nltk gensim



In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from gensim.models import Word2Vec

In [None]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Define a function to preprocess the text data
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())  # convert to lowercase and tokenize

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    return tokens

In [3]:
data = pd.read_csv("TCPD_QH.tsv", sep='\t', header=0, engine='python')

In [4]:
data.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,constituency,constituency_type,gender,subject,link
0,10173416,2017-08-02T00:00:00,16,Railways,Unstarred,(a) the number of cases which came to notice w...,MINISTER OF STATE IN THE MINISTRY OF RAILWAYS ...,RAJESH VERMA,BJP,Uttar Pradesh,Sitapur,GEN,Male,Cases of Fighting/Quarrels in Trains,https://loksabha.nic.in/Questions/QResult15.as...
1,10173216,2017-08-02T00:00:00,16,Information and Broadcasting,Unstarred,(a) the salient features of the Community Radi...,THE MINISTER OF STATE OF THE MINISTRY OF INFOR...,B. SREERAMULU,BJP,Karnataka,Bellary,ST,Male,Community Radio Station,https://loksabha.nic.in/Questions/QResult15.as...
2,10173116,2017-08-02T00:00:00,16,Skill Development and Entrepreneurship,Unstarred,(a) whether the India International Skill Cent...,(a) to (e) As on date 14 India International S...,"MEENAKASHI LEKHI, RAVINDRA KUMAR PANDEY","BJP, BJP","Delhi, Jharkhand","New delhi, Giridih","GEN, GEN","Female, Male",India International Skill Centre,https://loksabha.nic.in/Questions/QResult15.as...
3,10173016,2017-08-02T00:00:00,16,Housing and Urban Affairs,Unstarred,Will the Minister of HOUSING AND URBAN AFFAIRS...,THE MINISTER OF STATE IN THE MINISTRY OF HOUSI...,RAJENDRAN S,ADMK,Tamil Nadu,Viluppuram,SC,Male,MoU with Reckitt Benckiser,https://loksabha.nic.in/Questions/QResult15.as...
4,10172916,2017-08-02T00:00:00,16,Minority Affairs,Unstarred,(a) whether the Government proposes to carry o...,MINISTER OF STATE (INDEPENDENT CHARGE) FOR MIN...,DILIP PATEL,BJP,Gujarat,Anand,GEN,Male,Employment Surveys,https://loksabha.nic.in/Questions/QResult15.as...


In [None]:
data.dtypes

id                    int64
date                 object
ls_number             int64
ministry             object
question_type        object
question_text        object
answer_text          object
member               object
party                object
state                object
constituency         object
constituency_type    object
gender               object
subject              object
link                 object
dtype: object

In [6]:
data.describe(include="all")

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,constituency,constituency_type,gender,subject,link
count,298171.0,298171,298171.0,298171,298171,298171,298171,298171,298171,298171,298171,298171,298171,298171,298171
unique,,1307,,85,2,297726,295373,57037,17341,19686,54865,2643,1015,234261,298171
top,,2006-08-22T00:00:00,,Finance,Unstarred,(a) whether the information has since been col...,MINISTER OF STATE IN THE MINISTRY OF RAILWAYS ...,ATHAWALE RAMDAS BANDU,BJP,Maharashtra,Hyderabad,GEN,Male,RURAL ELECTRIFICATION,https://loksabha.nic.in/Questions/QResult15.as...
freq,,256,,20497,272707,5,38,1602,69457,26752,2334,163919,193301,83,1
mean,56051480.0,,14.54948,,,,,,,,,,,,
std,365056500.0,,1.126849,,,,,,,,,,,,
min,2100118.0,,13.0,,,,,,,,,,,,
25%,11641060.0,,14.0,,,,,,,,,,,,
50%,13506520.0,,15.0,,,,,,,,,,,,
75%,15370360.0,,16.0,,,,,,,,,,,,


In [8]:
import numpy as np 
np.round(data.describe(include="all"), 2)
np.round(data.describe(include="all"), 2).T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,298171.0,,,,56051483.81,365056467.49,2100118.0,11641064.5,13506516.0,15370364.5,3581100217.0
date,298171.0,1307.0,2006-08-22T00:00:00,256.0,,,,,,,
ls_number,298171.0,,,,14.55,1.13,13.0,14.0,15.0,16.0,16.0
ministry,298171.0,85.0,Finance,20497.0,,,,,,,
question_type,298171.0,2.0,Unstarred,272707.0,,,,,,,
question_text,298171.0,297726.0,(a) whether the information has since been col...,5.0,,,,,,,
answer_text,298171.0,295373.0,MINISTER OF STATE IN THE MINISTRY OF RAILWAYS ...,38.0,,,,,,,
member,298171.0,57037.0,ATHAWALE RAMDAS BANDU,1602.0,,,,,,,
party,298171.0,17341.0,BJP,69457.0,,,,,,,
state,298171.0,19686.0,Maharashtra,26752.0,,,,,,,


In [5]:
data = data.dropna()

In [None]:
dataset = data['question_text']   # storing the questions column
dataset.astype(str)

0         (a) the number of cases which came to notice w...
1         (a) the salient features of the Community Radi...
2         (a) whether the India International Skill Cent...
3         Will the Minister of HOUSING AND URBAN AFFAIRS...
4         (a) whether the Government proposes to carry o...
                                ...                        
298288    (a) whether there is any data sharing policy w...
298289    (a) the present status of various ongoing rail...
298290    (a) whether the Government has tried to assess...
298291    (a) whether the Government has received recomm...
298292    (a) the details of number of cases pending in ...
Name: question_text, Length: 298171, dtype: object

In [None]:
processed_dataset = [preprocess_text(text) for text in data['question_text'].astype(str)]

In [None]:
# train the Word2Vec model using the CBOW architecture
model = Word2Vec(sentences=processed_dataset, vector_size=100, window=5, min_count=1, workers=4, sg=0)

In [None]:
# finding words similar to 'woman'
similar_words = model.wv.most_similar('woman', topn=1000)

similar_words

[('couples', 0.7610868811607361),
 ('daughters', 0.7294248938560486),
 ('married', 0.7101222276687622),
 ('wife', 0.6932497620582581),
 ('husbands', 0.6872377991676331),
 ('spouse', 0.6798301339149475),
 ('his/her', 0.6721576452255249),
 ('mothers', 0.6683943271636963),
 ('person', 0.6636925339698792),
 ('adolescents', 0.6612431406974792),
 ('he/she', 0.6576167941093445),
 ('parents', 0.6543503403663635),
 ('husband', 0.6520535349845886),
 ('aspirants', 0.6492419838905334),
 ('separated', 0.6472859978675842),
 ('dies', 0.6471075415611267),
 ('kids', 0.6459189057350159),
 ('lawyers', 0.6434658765792847),
 ('divorced', 0.63853919506073),
 ('foetuses', 0.636870801448822),
 ('father', 0.6368151307106018),
 ('deserted', 0.6356186866760254),
 ('lactating', 0.6353264451026917),
 ('divorcee', 0.6351039409637451),
 ('sons', 0.6308828592300415),
 ('wives', 0.6296428442001343),
 ('philanthropists', 0.6272149085998535),
 ('expectant', 0.6235759854316711),
 ('minors', 0.6216840147972107),
 ('beatin

In [None]:
new_data = pd.DataFrame(similar_words, columns=['Word', 'Score'])

In [None]:
new_data.to_csv("words_data.csv")

In [None]:
word_cloud = WordCloud(collocations = False, background_color = 'white').generate(similar_words)

In [None]:
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Part 2: PSM and outcome analysis

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

In [None]:
df = data.copy()

In [None]:
df['question_text'] = df['question_text'].apply(preprocess_text)  # Use the preprocess_text function from earlier
df['answer_text'] = df['answer_text'].apply(preprocess_text)

In [None]:
df['woman_count'] = df['question_text'].apply(lambda x: x.count('woman')) # counting occurences per question

In [None]:
df.head()

In [None]:
# Define your independent variables (X) and dependent variable (y)
X = df[['ministry', 'party', 'state', 'constituency', 'constituency_type']]
y = df['gender'] == 'Female'  # Assuming you're using gender to define propensity

# Convert categorical variables to dummy variables
X = pd.get_dummies(X)

# Calculate propensity scores
log_reg = LogisticRegression()
log_reg.fit(X, y)
df['propensity_score'] = log_reg.predict_proba(X)[:, 1]