### Data processing



In [None]:
!pip install nltk gensim



In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from gensim.models import Word2Vec
import ast

In [None]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Define a function to preprocess the text data
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())  # convert to lowercase and tokenize

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    return tokens

In [None]:
data = pd.read_csv("/content/TCPD_QH.tsv", sep='\t', header=0, engine='python')

In [None]:
data.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,constituency,constituency_type,gender,subject,link
0,10173416,2017-08-02T00:00:00,16,Railways,Unstarred,(a) the number of cases which came to notice w...,MINISTER OF STATE IN THE MINISTRY OF RAILWAYS ...,RAJESH VERMA,BJP,Uttar Pradesh,Sitapur,GEN,Male,Cases of Fighting/Quarrels in Trains,https://loksabha.nic.in/Questions/QResult15.as...
1,10173216,2017-08-02T00:00:00,16,Information and Broadcasting,Unstarred,(a) the salient features of the Community Radi...,THE MINISTER OF STATE OF THE MINISTRY OF INFOR...,B. SREERAMULU,BJP,Karnataka,Bellary,ST,Male,Community Radio Station,https://loksabha.nic.in/Questions/QResult15.as...
2,10173116,2017-08-02T00:00:00,16,Skill Development and Entrepreneurship,Unstarred,(a) whether the India International Skill Cent...,(a) to (e) As on date 14 India International S...,"MEENAKASHI LEKHI, RAVINDRA KUMAR PANDEY","BJP, BJP","Delhi, Jharkhand","New delhi, Giridih","GEN, GEN","Female, Male",India International Skill Centre,https://loksabha.nic.in/Questions/QResult15.as...
3,10173016,2017-08-02T00:00:00,16,Housing and Urban Affairs,Unstarred,Will the Minister of HOUSING AND URBAN AFFAIRS...,THE MINISTER OF STATE IN THE MINISTRY OF HOUSI...,RAJENDRAN S,ADMK,Tamil Nadu,Viluppuram,SC,Male,MoU with Reckitt Benckiser,https://loksabha.nic.in/Questions/QResult15.as...
4,10172916,2017-08-02T00:00:00,16,Minority Affairs,Unstarred,(a) whether the Government proposes to carry o...,MINISTER OF STATE (INDEPENDENT CHARGE) FOR MIN...,DILIP PATEL,BJP,Gujarat,Anand,GEN,Male,Employment Surveys,https://loksabha.nic.in/Questions/QResult15.as...


In [None]:
data.dtypes

id                    int64
date                 object
ls_number             int64
ministry             object
question_type        object
question_text        object
answer_text          object
member               object
party                object
state                object
constituency         object
constituency_type    object
gender               object
subject              object
link                 object
dtype: object

In [None]:
data.describe(include="all")

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,constituency,constituency_type,gender,subject,link
count,298293.0,298293,298293.0,298293,298293,298251,298173,298293,298293,298293,298293,298293,298293,298291,298293
unique,,1307,,85,2,297731,295375,57051,17345,19691,54879,2643,1015,234344,298293
top,,2006-08-22T00:00:00,,Finance,Unstarred,URL_Not_Found,MINISTER OF STATE IN THE MINISTRY OF RAILWAYS ...,ATHAWALE RAMDAS BANDU,BJP,Maharashtra,Hyderabad,GEN,Male,RURAL ELECTRIFICATION,https://loksabha.nic.in/Questions/QResult15.as...
freq,,256,,20502,272810,77,38,1602,69485,26764,2334,163994,193359,83,1
mean,56076180.0,,14.549292,,,,,,,,,,,,
std,364996500.0,,1.127031,,,,,,,,,,,,
min,2100018.0,,13.0,,,,,,,,,,,,
25%,11640720.0,,14.0,,,,,,,,,,,,
50%,13506110.0,,15.0,,,,,,,,,,,,
75%,15370620.0,,16.0,,,,,,,,,,,,


In [None]:
data = data.dropna()

In [None]:
dataset = data['question_text']   # storing the questions column
dataset.astype(str)

0         (a) the number of cases which came to notice w...
1         (a) the salient features of the Community Radi...
2         (a) whether the India International Skill Cent...
3         Will the Minister of HOUSING AND URBAN AFFAIRS...
4         (a) whether the Government proposes to carry o...
                                ...                        
298288    (a) whether there is any data sharing policy w...
298289    (a) the present status of various ongoing rail...
298290    (a) whether the Government has tried to assess...
298291    (a) whether the Government has received recomm...
298292    (a) the details of number of cases pending in ...
Name: question_text, Length: 298171, dtype: object

In [None]:
# not needed for Zero shot
processed_dataset = [preprocess_text(text) for text in data['question_text'].astype(str)]

### Part 1: Using word2vec for vectorisation of questions corpus

In [None]:
# train the Word2Vec model using the CBOW architecture
model = Word2Vec(sentences=processed_dataset, vector_size=100, window=5, min_count=1, workers=4, sg=0)

In [None]:
# finding words similar to 'woman'
similar_words = model.wv.most_similar('woman', topn=1000)

similar_words

[('couples', 0.7610868811607361),
 ('daughters', 0.7294248938560486),
 ('married', 0.7101222276687622),
 ('wife', 0.6932497620582581),
 ('husbands', 0.6872377991676331),
 ('spouse', 0.6798301339149475),
 ('his/her', 0.6721576452255249),
 ('mothers', 0.6683943271636963),
 ('person', 0.6636925339698792),
 ('adolescents', 0.6612431406974792),
 ('he/she', 0.6576167941093445),
 ('parents', 0.6543503403663635),
 ('husband', 0.6520535349845886),
 ('aspirants', 0.6492419838905334),
 ('separated', 0.6472859978675842),
 ('dies', 0.6471075415611267),
 ('kids', 0.6459189057350159),
 ('lawyers', 0.6434658765792847),
 ('divorced', 0.63853919506073),
 ('foetuses', 0.636870801448822),
 ('father', 0.6368151307106018),
 ('deserted', 0.6356186866760254),
 ('lactating', 0.6353264451026917),
 ('divorcee', 0.6351039409637451),
 ('sons', 0.6308828592300415),
 ('wives', 0.6296428442001343),
 ('philanthropists', 0.6272149085998535),
 ('expectant', 0.6235759854316711),
 ('minors', 0.6216840147972107),
 ('beatin

In [None]:
new_data = pd.DataFrame(similar_words, columns=['Word', 'Score'])

In [None]:
new_data.to_csv("words_data.csv")

In [None]:
word_cloud = WordCloud(collocations = False, background_color = 'white').generate(similar_words)

In [None]:
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Part 2: PSM and outcome analysis

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

In [None]:
df = data.copy()

In [None]:
df['question_text'] = df['question_text'].apply(preprocess_text)  # Use the preprocess_text function from earlier
df['answer_text'] = df['answer_text'].apply(preprocess_text)

In [None]:
df['woman_count'] = df['question_text'].apply(lambda x: x.count('woman')) # counting occurences per question

In [None]:
df.head()

In [None]:
# Define your independent variables (X) and dependent variable (y)
X = df[['ministry', 'party', 'state', 'constituency', 'constituency_type']]
y = df['gender'] == 'Female'  # Assuming you're using gender to define propensity

# Convert categorical variables to dummy variables
X = pd.get_dummies(X)

# Calculate propensity scores
log_reg = LogisticRegression()
log_reg.fit(X, y)
df['propensity_score'] = log_reg.predict_proba(X)[:, 1]

### Zero Shot Classification

In [None]:
pip install transformers==3.1.0

Collecting transformers==3.1.0
  Downloading transformers-3.1.0-py3-none-any.whl (884 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/884.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/884.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m686.1/884.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m884.0/884.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers==0.8.1.rc2 (from transformers==3.1.0)
  Downloading tokenizers-0.8.1rc2.tar.gz (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.4/97.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?2

In [None]:
from transformers import pipeline

In [None]:
classifier = pipeline("zero-shot-classification", batch_size=8, device=0) # to utilize GPU

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
labels = ['corruption', 'development', 'accountability and transparency', 'programmatic representation', 'clientelistic representation']

In [None]:
# cleaning data
data = data.dropna()
data = data.astype(str)


In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
# Download necessary NLTK datasets
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
stop = set(stopwords.words('english'))
lemma = WordNetLemmatizer()

def preprocess(text):
    return " ".join(lemma.lemmatize(word) for word in text.lower().split() if word not in stop)

data['processed_text'] = data['question_text'].apply(preprocess)

In [None]:
data['processed_text']

0         (a) number case came notice regard fighting qu...
1         (a) salient feature community radio station (c...
2         (a) whether india international skill centre (...
3         minister housing urban affair pleased state: (...
4         (a) whether government proposes carry quarterl...
                                ...                        
298288    (a) whether data sharing policy different stat...
298289    (a) present status various ongoing railway pro...
298290    (a) whether government tried ass profitability...
298291    (a) whether government received recommendation...
298292    (a) detail number case pending madhya pradesh ...
Name: processed_text, Length: 298171, dtype: object

In [None]:
# using a subset of the data
data_sub = data.head(5)

def classify_text(text):
    # Perform classification
    results = classifier(text, labels, multi_label=True)
    # Return the labels and scores
    return results['labels'], results['scores']

# Apply the classification function to each row in 'processed_text'
data_sub['classification_results'] = data_sub['processed_text'].apply(lambda x: classify_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sub['classification_results'] = data_sub['processed_text'].apply(lambda x: classify_text(x))


In [None]:
# extracting the top category and score
data_sub['top_category'] = data_sub['classification_results'].apply(lambda x: x[0][0] if x else None)
data_sub['top_category_score'] = data_sub['classification_results'].apply(lambda x: x[1][0] if x else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sub['top_category'] = data_sub['classification_results'].apply(lambda x: x[0][0] if x else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sub['top_category_score'] = data_sub['classification_results'].apply(lambda x: x[1][0] if x else None)


In [None]:
data_sub.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,constituency,constituency_type,gender,subject,link,processed_text,classification_results,top_category,top_category_score
0,10173416,2017-08-02T00:00:00,16,Railways,Unstarred,(a) the number of cases which came to notice w...,MINISTER OF STATE IN THE MINISTRY OF RAILWAYS ...,RAJESH VERMA,BJP,Uttar Pradesh,Sitapur,GEN,Male,Cases of Fighting/Quarrels in Trains,https://loksabha.nic.in/Questions/QResult15.as...,(a) number case came notice regard fighting qu...,"([accountability and transparency, development...",accountability and transparency,0.818371
1,10173216,2017-08-02T00:00:00,16,Information and Broadcasting,Unstarred,(a) the salient features of the Community Radi...,THE MINISTER OF STATE OF THE MINISTRY OF INFOR...,B. SREERAMULU,BJP,Karnataka,Bellary,ST,Male,Community Radio Station,https://loksabha.nic.in/Questions/QResult15.as...,(a) salient feature community radio station (c...,"([programmatic representation, accountability ...",programmatic representation,0.948672
2,10173116,2017-08-02T00:00:00,16,Skill Development and Entrepreneurship,Unstarred,(a) whether the India International Skill Cent...,(a) to (e) As on date 14 India International S...,"MEENAKASHI LEKHI, RAVINDRA KUMAR PANDEY","BJP, BJP","Delhi, Jharkhand","New delhi, Giridih","GEN, GEN","Female, Male",India International Skill Centre,https://loksabha.nic.in/Questions/QResult15.as...,(a) whether india international skill centre (...,"([programmatic representation, accountability ...",programmatic representation,0.84173
3,10173016,2017-08-02T00:00:00,16,Housing and Urban Affairs,Unstarred,Will the Minister of HOUSING AND URBAN AFFAIRS...,THE MINISTER OF STATE IN THE MINISTRY OF HOUSI...,RAJENDRAN S,ADMK,Tamil Nadu,Viluppuram,SC,Male,MoU with Reckitt Benckiser,https://loksabha.nic.in/Questions/QResult15.as...,minister housing urban affair pleased state: (...,"([accountability and transparency, programmati...",accountability and transparency,0.912875
4,10172916,2017-08-02T00:00:00,16,Minority Affairs,Unstarred,(a) whether the Government proposes to carry o...,MINISTER OF STATE (INDEPENDENT CHARGE) FOR MIN...,DILIP PATEL,BJP,Gujarat,Anand,GEN,Male,Employment Surveys,https://loksabha.nic.in/Questions/QResult15.as...,(a) whether government proposes carry quarterl...,"([accountability and transparency, programmati...",accountability and transparency,0.638563


In [None]:
df_questions = data.copy()

In [None]:
df_questions.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,constituency,constituency_type,gender,subject,link,processed_text
0,10173416,2017-08-02T00:00:00,16,Railways,Unstarred,(a) the number of cases which came to notice w...,MINISTER OF STATE IN THE MINISTRY OF RAILWAYS ...,RAJESH VERMA,BJP,Uttar Pradesh,Sitapur,GEN,Male,Cases of Fighting/Quarrels in Trains,https://loksabha.nic.in/Questions/QResult15.as...,(a) number case came notice regard fighting qu...
1,10173216,2017-08-02T00:00:00,16,Information and Broadcasting,Unstarred,(a) the salient features of the Community Radi...,THE MINISTER OF STATE OF THE MINISTRY OF INFOR...,B. SREERAMULU,BJP,Karnataka,Bellary,ST,Male,Community Radio Station,https://loksabha.nic.in/Questions/QResult15.as...,(a) salient feature community radio station (c...
2,10173116,2017-08-02T00:00:00,16,Skill Development and Entrepreneurship,Unstarred,(a) whether the India International Skill Cent...,(a) to (e) As on date 14 India International S...,"MEENAKASHI LEKHI, RAVINDRA KUMAR PANDEY","BJP, BJP","Delhi, Jharkhand","New delhi, Giridih","GEN, GEN","Female, Male",India International Skill Centre,https://loksabha.nic.in/Questions/QResult15.as...,(a) whether india international skill centre (...
3,10173016,2017-08-02T00:00:00,16,Housing and Urban Affairs,Unstarred,Will the Minister of HOUSING AND URBAN AFFAIRS...,THE MINISTER OF STATE IN THE MINISTRY OF HOUSI...,RAJENDRAN S,ADMK,Tamil Nadu,Viluppuram,SC,Male,MoU with Reckitt Benckiser,https://loksabha.nic.in/Questions/QResult15.as...,minister housing urban affair pleased state: (...
4,10172916,2017-08-02T00:00:00,16,Minority Affairs,Unstarred,(a) whether the Government proposes to carry o...,MINISTER OF STATE (INDEPENDENT CHARGE) FOR MIN...,DILIP PATEL,BJP,Gujarat,Anand,GEN,Male,Employment Surveys,https://loksabha.nic.in/Questions/QResult15.as...,(a) whether government proposes carry quarterl...


In [None]:
df_questions = df_questions[df_questions['gender'].isin(['Male', 'Female'])]

In [None]:
df_questions.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,constituency,constituency_type,gender,subject,link,processed_text
0,10173416,2017-08-02T00:00:00,16,Railways,Unstarred,(a) the number of cases which came to notice w...,MINISTER OF STATE IN THE MINISTRY OF RAILWAYS ...,RAJESH VERMA,BJP,Uttar Pradesh,Sitapur,GEN,Male,Cases of Fighting/Quarrels in Trains,https://loksabha.nic.in/Questions/QResult15.as...,(a) number case came notice regard fighting qu...
1,10173216,2017-08-02T00:00:00,16,Information and Broadcasting,Unstarred,(a) the salient features of the Community Radi...,THE MINISTER OF STATE OF THE MINISTRY OF INFOR...,B. SREERAMULU,BJP,Karnataka,Bellary,ST,Male,Community Radio Station,https://loksabha.nic.in/Questions/QResult15.as...,(a) salient feature community radio station (c...
3,10173016,2017-08-02T00:00:00,16,Housing and Urban Affairs,Unstarred,Will the Minister of HOUSING AND URBAN AFFAIRS...,THE MINISTER OF STATE IN THE MINISTRY OF HOUSI...,RAJENDRAN S,ADMK,Tamil Nadu,Viluppuram,SC,Male,MoU with Reckitt Benckiser,https://loksabha.nic.in/Questions/QResult15.as...,minister housing urban affair pleased state: (...
4,10172916,2017-08-02T00:00:00,16,Minority Affairs,Unstarred,(a) whether the Government proposes to carry o...,MINISTER OF STATE (INDEPENDENT CHARGE) FOR MIN...,DILIP PATEL,BJP,Gujarat,Anand,GEN,Male,Employment Surveys,https://loksabha.nic.in/Questions/QResult15.as...,(a) whether government proposes carry quarterl...
6,10172716,2017-08-02T00:00:00,16,Skill Development and Entrepreneurship,Unstarred,(a) whether the Government has framed rules an...,(a) to (e) The Ministry of Skill Development a...,ANJU BALA,BJP,Uttar Pradesh,Misrikh,SC,Female,Retired Government Officers as Consultants,https://loksabha.nic.in/Questions/QResult15.as...,(a) whether government framed rule guideline h...


In [None]:
df_constituency = pd.read_csv("/content/TCPD_GE_All_States_2024-1-27.csv")

  df_constituency = pd.read_csv("/content/TCPD_GE_All_States_2024-1-27.csv")


In [None]:
df_constituency.head()

Unnamed: 0,State_Name,Assembly_No,Constituency_No,Year,month,Poll_No,DelimID,Position,Candidate,Sex,...,No_Terms,Turncoat,Incumbent,Recontest,MyNeta_education,TCPD_Prof_Main,TCPD_Prof_Main_Desc,TCPD_Prof_Second,TCPD_Prof_Second_Desc,Election_Type
0,Andaman_&_Nicobar_Islands,17,1,2019,4.0,0,4,1,KULDEEP RAI SHARMA,M,...,1.0,False,False,True,Graduate Professional,Business,,Social Work,,Lok Sabha Election (GE)
1,Andaman_&_Nicobar_Islands,17,1,2019,4.0,0,4,2,VISHAL JOLLY,M,...,0.0,False,False,False,Graduate Professional,Liberal Profession or Professional,Lawyer,,,Lok Sabha Election (GE)
2,Andaman_&_Nicobar_Islands,17,1,2019,4.0,0,4,3,PARITOSH KUMAR HALDAR,M,...,0.0,False,False,False,Post Graduate,Agriculture,,,,Lok Sabha Election (GE)
3,Andaman_&_Nicobar_Islands,17,1,2019,4.0,0,4,4,SANJAY MESHACK,M,...,0.0,False,False,True,12th Pass,Business,,Politics,Municipality Member,Lok Sabha Election (GE)
4,Andaman_&_Nicobar_Islands,17,1,2019,4.0,0,4,5,PRAKASH MINJ,M,...,0.0,False,False,False,Post Graduate,Social Work,,,,Lok Sabha Election (GE)


In [None]:
df_constituency = df_constituency[df_constituency['Position'] == 1]

In [None]:
df_constituency.head()

Unnamed: 0,State_Name,Assembly_No,Constituency_No,Year,month,Poll_No,DelimID,Position,Candidate,Sex,...,No_Terms,Turncoat,Incumbent,Recontest,MyNeta_education,TCPD_Prof_Main,TCPD_Prof_Main_Desc,TCPD_Prof_Second,TCPD_Prof_Second_Desc,Election_Type
0,Andaman_&_Nicobar_Islands,17,1,2019,4.0,0,4,1,KULDEEP RAI SHARMA,M,...,1.0,False,False,True,Graduate Professional,Business,,Social Work,,Lok Sabha Election (GE)
16,Andhra_Pradesh,17,1,2019,4.0,0,4,1,GODDETI. MADHAVI,F,...,1.0,False,False,False,Graduate Professional,Social Work,,,,Lok Sabha Election (GE)
27,Andhra_Pradesh,17,2,2019,4.0,0,4,1,KINJARAPU RAM MOHAN NAIDU,M,...,1.0,False,False,False,Post Graduate,Politics,,,,Lok Sabha Election (GE)
37,Andhra_Pradesh,17,3,2019,4.0,0,4,1,Bellana Chandra Sekhar,M,...,1.0,False,False,False,Graduate Professional,Business,,Liberal Profession or Professional,Lawyer,Lok Sabha Election (GE)
52,Andhra_Pradesh,17,4,2019,4.0,0,4,1,M V V SATYANARAYANA,M,...,1.0,False,False,False,10th Pass,Business,Real Estate or Builder or Developer or Constru...,Salaried Work or Employed,Managerial Position,Lok Sabha Election (GE)


In [None]:
# lowercase
df_questions['constituency'] = df_questions['constituency'].str.lower()
df_constituency['Constituency_Name'] = df_constituency['Constituency_Name'].str.lower()

In [None]:
df_questions.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,constituency,constituency_type,gender,subject,link,processed_text
0,10173416,2017-08-02T00:00:00,16,Railways,Unstarred,(a) the number of cases which came to notice w...,MINISTER OF STATE IN THE MINISTRY OF RAILWAYS ...,RAJESH VERMA,BJP,Uttar Pradesh,sitapur,GEN,Male,Cases of Fighting/Quarrels in Trains,https://loksabha.nic.in/Questions/QResult15.as...,(a) number case came notice regard fighting qu...
1,10173216,2017-08-02T00:00:00,16,Information and Broadcasting,Unstarred,(a) the salient features of the Community Radi...,THE MINISTER OF STATE OF THE MINISTRY OF INFOR...,B. SREERAMULU,BJP,Karnataka,bellary,ST,Male,Community Radio Station,https://loksabha.nic.in/Questions/QResult15.as...,(a) salient feature community radio station (c...
3,10173016,2017-08-02T00:00:00,16,Housing and Urban Affairs,Unstarred,Will the Minister of HOUSING AND URBAN AFFAIRS...,THE MINISTER OF STATE IN THE MINISTRY OF HOUSI...,RAJENDRAN S,ADMK,Tamil Nadu,viluppuram,SC,Male,MoU with Reckitt Benckiser,https://loksabha.nic.in/Questions/QResult15.as...,minister housing urban affair pleased state: (...
4,10172916,2017-08-02T00:00:00,16,Minority Affairs,Unstarred,(a) whether the Government proposes to carry o...,MINISTER OF STATE (INDEPENDENT CHARGE) FOR MIN...,DILIP PATEL,BJP,Gujarat,anand,GEN,Male,Employment Surveys,https://loksabha.nic.in/Questions/QResult15.as...,(a) whether government proposes carry quarterl...
6,10172716,2017-08-02T00:00:00,16,Skill Development and Entrepreneurship,Unstarred,(a) whether the Government has framed rules an...,(a) to (e) The Ministry of Skill Development a...,ANJU BALA,BJP,Uttar Pradesh,misrikh,SC,Female,Retired Government Officers as Consultants,https://loksabha.nic.in/Questions/QResult15.as...,(a) whether government framed rule guideline h...


In [None]:
df_constituency['Constituency_Name']

0        andaman & nicobar islands
16                           aruku
27                      srikakulam
37                    vizianagaram
52                   visakhapatnam
                   ...            
91655                     amravati
91657                       tumkur
91660                 chikballapur
91662                    bangalore
91665                      hathras
Name: Constituency_Name, Length: 8291, dtype: object

In [None]:
# 'date' is a string column
df_questions['date'] = pd.to_datetime(df_questions['date'])
df_questions['Year'] = df_questions['date'].dt.year  # Extract year

In [None]:
df_constituency['Year'] = df_constituency['Year'].astype(int)

In [None]:
# Standardize gender coding in df_questions
gender_map = {'Male': 'M', 'Female': 'F'}
df_questions['gender'] = df_questions['gender'].replace(gender_map)

In [None]:
# Trying match identifiers

df_questions['constituency'] = df_questions['constituency'].str.lower().str.replace(' ', '_')
df_questions['constituency_year'] = df_questions['constituency'] + '_' + df_questions['Year'].astype(str)

df_constituency['Constituency_Name'] = df_constituency['Constituency_Name'].str.lower().str.replace(' ', '_')
df_constituency['constituency_year'] = df_constituency['Constituency_Name'] + '_' + df_constituency['Year'].astype(str)

In [None]:
df_questions.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,constituency,constituency_type,gender,subject,link,processed_text,Year,constituency_year
0,10173416,2017-08-02,16,Railways,Unstarred,(a) the number of cases which came to notice w...,MINISTER OF STATE IN THE MINISTRY OF RAILWAYS ...,RAJESH VERMA,BJP,Uttar Pradesh,sitapur,GEN,M,Cases of Fighting/Quarrels in Trains,https://loksabha.nic.in/Questions/QResult15.as...,(a) number case came notice regard fighting qu...,2017,sitapur_2017
1,10173216,2017-08-02,16,Information and Broadcasting,Unstarred,(a) the salient features of the Community Radi...,THE MINISTER OF STATE OF THE MINISTRY OF INFOR...,B. SREERAMULU,BJP,Karnataka,bellary,ST,M,Community Radio Station,https://loksabha.nic.in/Questions/QResult15.as...,(a) salient feature community radio station (c...,2017,bellary_2017
3,10173016,2017-08-02,16,Housing and Urban Affairs,Unstarred,Will the Minister of HOUSING AND URBAN AFFAIRS...,THE MINISTER OF STATE IN THE MINISTRY OF HOUSI...,RAJENDRAN S,ADMK,Tamil Nadu,viluppuram,SC,M,MoU with Reckitt Benckiser,https://loksabha.nic.in/Questions/QResult15.as...,minister housing urban affair pleased state: (...,2017,viluppuram_2017
4,10172916,2017-08-02,16,Minority Affairs,Unstarred,(a) whether the Government proposes to carry o...,MINISTER OF STATE (INDEPENDENT CHARGE) FOR MIN...,DILIP PATEL,BJP,Gujarat,anand,GEN,M,Employment Surveys,https://loksabha.nic.in/Questions/QResult15.as...,(a) whether government proposes carry quarterl...,2017,anand_2017
6,10172716,2017-08-02,16,Skill Development and Entrepreneurship,Unstarred,(a) whether the Government has framed rules an...,(a) to (e) The Ministry of Skill Development a...,ANJU BALA,BJP,Uttar Pradesh,misrikh,SC,F,Retired Government Officers as Consultants,https://loksabha.nic.in/Questions/QResult15.as...,(a) whether government framed rule guideline h...,2017,misrikh_2017


In [None]:
df_constituency.head()

Unnamed: 0,State_Name,Assembly_No,Constituency_No,Year,month,Poll_No,DelimID,Position,Candidate,Sex,...,Turncoat,Incumbent,Recontest,MyNeta_education,TCPD_Prof_Main,TCPD_Prof_Main_Desc,TCPD_Prof_Second,TCPD_Prof_Second_Desc,Election_Type,constituency_year
0,Andaman_&_Nicobar_Islands,17,1,2019,4.0,0,4,1,KULDEEP RAI SHARMA,M,...,False,False,True,Graduate Professional,Business,,Social Work,,Lok Sabha Election (GE),andaman_&_nicobar_islands_2019
16,Andhra_Pradesh,17,1,2019,4.0,0,4,1,GODDETI. MADHAVI,F,...,False,False,False,Graduate Professional,Social Work,,,,Lok Sabha Election (GE),aruku_2019
27,Andhra_Pradesh,17,2,2019,4.0,0,4,1,KINJARAPU RAM MOHAN NAIDU,M,...,False,False,False,Post Graduate,Politics,,,,Lok Sabha Election (GE),srikakulam_2019
37,Andhra_Pradesh,17,3,2019,4.0,0,4,1,Bellana Chandra Sekhar,M,...,False,False,False,Graduate Professional,Business,,Liberal Profession or Professional,Lawyer,Lok Sabha Election (GE),vizianagaram_2019
52,Andhra_Pradesh,17,4,2019,4.0,0,4,1,M V V SATYANARAYANA,M,...,False,False,False,10th Pass,Business,Real Estate or Builder or Developer or Constru...,Salaried Work or Employed,Managerial Position,Lok Sabha Election (GE),visakhapatnam_2019


In [None]:
# Merging the datasets on the unique identifier
df_merged = pd.merge(df_questions, df_constituency, on='constituency_year', how='right')

In [None]:
df_merged.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,...,No_Terms,Turncoat,Incumbent,Recontest,MyNeta_education,TCPD_Prof_Main,TCPD_Prof_Main_Desc,TCPD_Prof_Second,TCPD_Prof_Second_Desc,Election_Type
0,,NaT,,,,,,,,,...,1.0,False,False,True,Graduate Professional,Business,,Social Work,,Lok Sabha Election (GE)
1,,NaT,,,,,,,,,...,1.0,False,False,False,Graduate Professional,Social Work,,,,Lok Sabha Election (GE)
2,318415516.0,2019-02-04,16.0,Labour and Employment,Unstarred,(a) whether the Government has set sectoral jo...,MINISTER OF STATE (IC) FOR LABOUR AND EMPLOYME...,RAMMOHAN NAIDU KINJARAPU,TDP,Andhra Pradesh,...,1.0,False,False,False,Post Graduate,Politics,,,,Lok Sabha Election (GE)
3,31751616.0,2019-02-04,16.0,Labour and Employment,Starred,(a) whether it is true that the unemployment r...,MINISTER OF STATE (IC) FOR LABOUR AND EMPLOYME...,RAMMOHAN NAIDU KINJARAPU,TDP,Andhra Pradesh,...,1.0,False,False,False,Post Graduate,Politics,,,,Lok Sabha Election (GE)
4,317821216.0,2019-02-04,16.0,Labour and Employment,Unstarred,(a)whether the Government is aware of the skew...,MINISTER OF STATE (IC) FOR LABOUR AND EMPLOYME...,RAMMOHAN NAIDU KINJARAPU,TDP,Andhra Pradesh,...,1.0,False,False,False,Post Graduate,Politics,,,,Lok Sabha Election (GE)


In [None]:
print(df_merged.isnull().sum())

id                        6731
date                      6731
ls_number                 6731
ministry                  6731
question_type             6731
                         ...  
TCPD_Prof_Main            7444
TCPD_Prof_Main_Desc      29178
TCPD_Prof_Second         30902
TCPD_Prof_Second_Desc    34267
Election_Type                0
Length: 63, dtype: int64


In [None]:
df_constituency.shape[0]

8291

In [None]:
df_merged.shape[0]

35294

In [None]:
df_merged = df_merged.drop(columns=['TCPD_Prof_Main', 'TCPD_Prof_Main_Desc', 'TCPD_Prof_Second',
                                    'TCPD_Prof_Second_Desc', 'Election_Type',
                                    ])

In [None]:
df_merged = df_merged[df_merged['question_text'].notna()]

In [None]:
df_merged.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,...,Last_Party,Last_Party_ID,Last_Constituency_Name,Same_Constituency,Same_Party,No_Terms,Turncoat,Incumbent,Recontest,MyNeta_education
2,318415516,2019-02-04,16,Labour and Employment,Unstarred,(a) whether the Government has set sectoral jo...,MINISTER OF STATE (IC) FOR LABOUR AND EMPLOYME...,RAMMOHAN NAIDU KINJARAPU,TDP,Andhra Pradesh,...,,,,,,1.0,False,False,False,Post Graduate
3,31751616,2019-02-04,16,Labour and Employment,Starred,(a) whether it is true that the unemployment r...,MINISTER OF STATE (IC) FOR LABOUR AND EMPLOYME...,RAMMOHAN NAIDU KINJARAPU,TDP,Andhra Pradesh,...,,,,,,1.0,False,False,False,Post Graduate
4,317821216,2019-02-04,16,Labour and Employment,Unstarred,(a)whether the Government is aware of the skew...,MINISTER OF STATE (IC) FOR LABOUR AND EMPLOYME...,RAMMOHAN NAIDU KINJARAPU,TDP,Andhra Pradesh,...,,,,,,1.0,False,False,False,Post Graduate
5,313857616,2019-02-06,16,Defence,Unstarred,(a) whether it is true that the capex in defen...,MINISTER OF STATE (DR. SUBHASH BHAMRE) IN THE ...,RAMMOHAN NAIDU KINJARAPU,TDP,Andhra Pradesh,...,,,,,,1.0,False,False,False,Post Graduate
6,386106216,2019-02-08,16,Finance,Unstarred,(a) whether the public sector asset expansion ...,THE MINISTER OF STATE FOR FINANCE (SHRI PON RA...,RAMMOHAN NAIDU KINJARAPU,TDP,Andhra Pradesh,...,,,,,,1.0,False,False,False,Post Graduate


In [None]:
df_merged.to_csv('merged_dataset.csv', index=False)

In [None]:
# checking for treatment constituencies
df_constituency_sorted = df_constituency.sort_values(by=['Constituency_No', 'Year'])

# Identify changes in gender representation
df_constituency_sorted['previous_sex'] = df_constituency_sorted.groupby('Constituency_No')['Sex'].shift(1)
df_constituency_sorted = df_constituency_sorted.dropna(subset=['previous_sex'])
df_constituency_sorted['gender_change'] = df_constituency_sorted['Sex'] != df_constituency_sorted['previous_sex']


# Filter for treated constituencies
treated_constituencies = df_constituency_sorted[df_constituency_sorted['gender_change']]

In [None]:
df_constituency_sorted

Unnamed: 0,State_Name,Assembly_No,Constituency_No,Year,month,Poll_No,DelimID,Position,Candidate,Sex,...,Recontest,MyNeta_education,TCPD_Prof_Main,TCPD_Prof_Main_Desc,TCPD_Prof_Second,TCPD_Prof_Second_Desc,Election_Type,constituency_year,previous_sex,gender_change
89778,Assam,3,1,1962,2.0,0,1,1,JYOTSNA CHANDA,F,...,False,,,,,,Lok Sabha Election (GE),cachar_1962,M,True
89819,Bihar,3,1,1962,2.0,0,1,1,KAMAL NATH TEWARI,M,...,False,,,,,,Lok Sabha Election (GE),bagaha_1962,F,True
90052,Delhi,3,1,1962,2.0,0,1,1,MEHR CHAND KHANNA,M,...,False,,,,,,Lok Sabha Election (GE),new_delhi_1962,M,False
90080,Gujarat,3,1,1962,2.0,0,1,1,M. K. S. HIMATSINHJI VIJARAJJI,M,...,False,,,,,,Lok Sabha Election (GE),kutch_1962,M,False
90148,Himachal_Pradesh,3,1,1962,2.0,0,1,1,CHATTAR SINGH,M,...,False,,,,,,Lok Sabha Election (GE),chamba_1962,M,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70663,Uttar_Pradesh,9,85,1989,11.0,0,3,1,JAGPAL SINGH,M,...,False,,,,,,Lok Sabha Election (GE),hardwar_1989,M,False
63980,Uttar_Pradesh,10,85,1991,5.0,0,3,1,RAM SINGH,M,...,False,,,,,,Lok Sabha Election (GE),hardwar_1991,M,False
55225,Uttar_Pradesh,11,85,1996,4.0,0,3,1,HARPAL SATHI,M,...,False,,,,,,Lok Sabha Election (GE),hardwar_1996,M,False
41396,Uttar_Pradesh,12,85,1998,3.0,0,3,1,HARPAL SINGH SATHI,M,...,True,,,,,,Lok Sabha Election (GE),hardwar_1998,M,False


In [None]:
df_constituency_sorted.shape[0]

8143

In [None]:
treated_constituencies.shape[0]

1224

In [None]:
treated_constituencies

Unnamed: 0,State_Name,Assembly_No,Constituency_No,Year,month,Poll_No,DelimID,Position,Candidate,Sex,...,Recontest,MyNeta_education,TCPD_Prof_Main,TCPD_Prof_Main_Desc,TCPD_Prof_Second,TCPD_Prof_Second_Desc,Election_Type,constituency_year,previous_sex,gender_change
89778,Assam,3,1,1962,2.0,0,1,1,JYOTSNA CHANDA,F,...,False,,,,,,Lok Sabha Election (GE),cachar_1962,M,True
89819,Bihar,3,1,1962,2.0,0,1,1,KAMAL NATH TEWARI,M,...,False,,,,,,Lok Sabha Election (GE),bagaha_1962,F,True
91619,West_Bengal,3,1,1963,,1,1,1,P.C.BARMAN,,...,False,,,,,,Lok Sabha Election (GE),cooch_1963,M,True
87276,Assam,4,1,1967,2.0,0,2,1,J. CHANDA,F,...,True,,,,,,Lok Sabha Election (GE),cachar_1967,M,True
87323,Bihar,4,1,1967,2.0,0,2,1,B. RAUT,M,...,True,,,,,,Lok Sabha Election (GE),bagaha_1967,F,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89293,Uttar_Pradesh,4,80,1967,2.0,0,2,1,M.S. BHARTI,M,...,False,,,,,,Lok Sabha Election (GE),meerut_1967,F,True
81101,Uttar_Pradesh,7,80,1980,1.0,0,3,1,MOHSINA KIDWAI,F,...,True,,,,,,Lok Sabha Election (GE),meerut_1980,M,True
70596,Uttar_Pradesh,9,80,1989,11.0,0,3,1,HARISH PAL,M,...,False,,,,,,Lok Sabha Election (GE),meerut_1989,F,True
81139,Uttar_Pradesh,7,83,1980,1.0,0,3,1,GYATRI DEVI,F,...,False,,,,,,Lok Sabha Election (GE),kairana_1980,M,True


In [None]:
# Merging the datasets on the unique identifier
df_merged_final = pd.merge(df_questions, df_constituency_sorted, on='constituency_year', how='right')

In [None]:
df_merged_final.shape[0]

35129

In [None]:
df_merged_final = df_merged_final[df_merged_final['question_text'].notna()]
df_merged_final.shape[0]

28543

In [None]:
df_merged_final.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,...,Incumbent,Recontest,MyNeta_education,TCPD_Prof_Main,TCPD_Prof_Main_Desc,TCPD_Prof_Second,TCPD_Prof_Second_Desc,Election_Type,previous_sex,gender_change
304,17350913,1999-12-09,13,Rural Development,Unstarred,(a) whether any scheme is under consideation o...,MINISTER OF STATE FOR RURAL DEVELOPMENT (SHRI ...,ABDUL RASHID SHAHEEN,JKN,Jammu & Kashmir,...,False,False,,,,,,Lok Sabha Election (GE),M,False
306,17354313,1999-12-01,13,Prime Minister,Unstarred,(a) whether the Government propose to conduct ...,THE MINISTER OF STATE IN THE DEPARTMENT OF ATO...,T. GOVINDAN,CPM,Kerala,...,True,True,,,,,,Lok Sabha Election (GE),M,False
320,17353813,1999-12-02,13,Rural Development,Unstarred,(a) The guidelines and directions issued by th...,Minister of State for RURAL DEVELOPMENT (SHRI ...,SAMAR CHOWDHURY,CPM,Tripura,...,True,True,,,,,,Lok Sabha Election (GE),M,False
324,10021313,2004-02-04,13,Shipping,Unstarred,(a) the time when the Sompen Hospital Ship arr...,MINISTER OF STATE IN THE MINISTRY OF SHIPPING ...,BISHNU PADA RAY,BJP,Andaman & Nicobar Islands,...,False,True,Post Graduate,Other,,,,Lok Sabha Election (GE),M,False
325,16344514,2004-08-16,14,Tourism,Unstarred,(a) whether the Government has any proposal to...,MINISTER OF STATE FOR TOURISM (INDEPENDENT CHA...,MANORANJAN BHAKTA,INC,Andaman & Nicobar Islands,...,False,True,Post Graduate,Other,,,,Lok Sabha Election (GE),M,False


In [None]:
df_merged_final.to_csv('merged_dataset_final.csv', index=False)

In [None]:
df_merged_final['processed_text']

304      (a) whether scheme consideation government all...
306      (a) whether government propose conduct nuclear...
320      (a) guideline direction issued planning commis...
324      (a) time sompen hospital ship arrived port bla...
325      (a) whether government proposal develop touris...
                               ...                        
35059    (a) whether employee railway covered health sc...
35060    (a) whether deforestation/felling tree going v...
35061    (a) detail step taken government conservation ...
35074    (a) whether government aware five star hotel m...
35075    (a) whether government proposes promote nation...
Name: processed_text, Length: 28543, dtype: object

In [None]:
# carrying out the zero shot classification on this data
df_merged_final['classification_results'] = df_merged_final['processed_text'].apply(lambda x: classify_text(x))
# extracting the top category and score
df_merged_final['top_category'] = df_merged_final['classification_results'].apply(lambda x: x[0][0] if x else None)
df_merged_final['top_category_score'] = df_merged_final['classification_results'].apply(lambda x: x[1][0] if x else None)

In [None]:
df_merged_final.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,...,TCPD_Prof_Main,TCPD_Prof_Main_Desc,TCPD_Prof_Second,TCPD_Prof_Second_Desc,Election_Type,previous_sex,gender_change,classification_results,top_category,top_category_score
304,17350913,1999-12-09,13,Rural Development,Unstarred,(a) whether any scheme is under consideation o...,MINISTER OF STATE FOR RURAL DEVELOPMENT (SHRI ...,ABDUL RASHID SHAHEEN,JKN,Jammu & Kashmir,...,,,,,Lok Sabha Election (GE),M,False,"([programmatic representation, accountability ...",programmatic representation,0.769578
306,17354313,1999-12-01,13,Prime Minister,Unstarred,(a) whether the Government propose to conduct ...,THE MINISTER OF STATE IN THE DEPARTMENT OF ATO...,T. GOVINDAN,CPM,Kerala,...,,,,,Lok Sabha Election (GE),M,False,"([accountability and transparency, programmati...",accountability and transparency,0.614412
320,17353813,1999-12-02,13,Rural Development,Unstarred,(a) The guidelines and directions issued by th...,Minister of State for RURAL DEVELOPMENT (SHRI ...,SAMAR CHOWDHURY,CPM,Tripura,...,,,,,Lok Sabha Election (GE),M,False,"([programmatic representation, accountability ...",programmatic representation,0.874857
324,10021313,2004-02-04,13,Shipping,Unstarred,(a) the time when the Sompen Hospital Ship arr...,MINISTER OF STATE IN THE MINISTRY OF SHIPPING ...,BISHNU PADA RAY,BJP,Andaman & Nicobar Islands,...,Other,,,,Lok Sabha Election (GE),M,False,"([accountability and transparency, clientelist...",accountability and transparency,0.433297
325,16344514,2004-08-16,14,Tourism,Unstarred,(a) whether the Government has any proposal to...,MINISTER OF STATE FOR TOURISM (INDEPENDENT CHA...,MANORANJAN BHAKTA,INC,Andaman & Nicobar Islands,...,Other,,,,Lok Sabha Election (GE),M,False,"([development, accountability and transparency...",development,0.952534


In [None]:
df_merged_final['classification_results'][304]

(['programmatic representation',
  'accountability and transparency',
  'clientelistic representation',
  'development',
  'corruption'],
 [0.7695783376693726,
  0.6634148359298706,
  0.2397715002298355,
  0.19319695234298706,
  0.018497761338949203])

In [None]:
df_merged_final = df_merged_final.drop(columns=['link', 'TCPD_Prof_Main', 'TCPD_Prof_Main_Desc', 'TCPD_Prof_Second',
                                    'TCPD_Prof_Second_Desc', 'Election_Type', 'MyNeta_education',
                                    'Recontest', 'Incumbent', 'Turncoat', 'No_Terms', 'Same_Party',
                                    'Same_Constituency', 'Last_Constituency_Name', 'Last_Party_ID',
                                    'Last_Party', 'Contested', 'last_poll', 'Party_ID', 'Party_Type_TCPD',
                                    'pid', 'ENOP', 'Year_x'
                                    ])

In [None]:
df_merged_final.rename(columns={"Year_y": "Year"})

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,...,Turnout_Percentage,Vote_Share_Percentage,Deposit_Lost,Margin,Margin_Percentage,previous_sex,gender_change,classification_results,top_category,top_category_score
304,17350913,1999-12-09,13,Rural Development,Unstarred,(a) whether any scheme is under consideation o...,MINISTER OF STATE FOR RURAL DEVELOPMENT (SHRI ...,ABDUL RASHID SHAHEEN,JKN,Jammu & Kashmir,...,25.11,43.94,no,36113.0,18.83,M,False,"([programmatic representation, accountability ...",programmatic representation,0.769578
306,17354313,1999-12-01,13,Prime Minister,Unstarred,(a) whether the Government propose to conduct ...,THE MINISTER OF STATE IN THE DEPARTMENT OF ATO...,T. GOVINDAN,CPM,Kerala,...,77.12,45.77,no,31578.0,3.41,M,False,"([accountability and transparency, programmati...",accountability and transparency,0.614412
320,17353813,1999-12-02,13,Rural Development,Unstarred,(a) The guidelines and directions issued by th...,Minister of State for RURAL DEVELOPMENT (SHRI ...,SAMAR CHOWDHURY,CPM,Tripura,...,68.88,57.46,no,198399.0,30.60,M,False,"([programmatic representation, accountability ...",programmatic representation,0.874857
324,10021313,2004-02-04,13,Shipping,Unstarred,(a) the time when the Sompen Hospital Ship arr...,MINISTER OF STATE IN THE MINISTRY OF SHIPPING ...,BISHNU PADA RAY,BJP,Andaman & Nicobar Islands,...,63.66,55.77,no,30500.0,19.83,M,False,"([accountability and transparency, clientelist...",accountability and transparency,0.433297
325,16344514,2004-08-16,14,Tourism,Unstarred,(a) whether the Government has any proposal to...,MINISTER OF STATE FOR TOURISM (INDEPENDENT CHA...,MANORANJAN BHAKTA,INC,Andaman & Nicobar Islands,...,63.66,55.77,no,30500.0,19.83,M,False,"([development, accountability and transparency...",development,0.952534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,14598216,2014-12-22,16,Railways,Unstarred,(a) whether the employees of Railways are cove...,MINISTER OF STATE IN THE MINISTRY OF RAILWAYS ...,ANUPRIYA SINGH PATEL,AD,Uttar Pradesh,...,58.56,43.32,no,219079.0,21.74,M,True,"([accountability and transparency, programmati...",accountability and transparency,0.750319
35060,14566316,2014-12-23,16,"Environment, Forests and Climate Change",Unstarred,(a) whether deforestation/felling of trees is ...,MINISTER OF STATE (INDEPENDENT CHARGE) FOR ENV...,ANUPRIYA SINGH PATEL,AD,Uttar Pradesh,...,58.56,43.32,no,219079.0,21.74,M,True,"([programmatic representation, accountability ...",programmatic representation,0.366077
35061,15517516,2014-07-21,16,Mines,Unstarred,(a) the details of steps taken by the Governme...,"THE MINISTER OF STATE FOR MINES, STEEL AND LAB...",ANUPRIYA SINGH PATEL,AD,Uttar Pradesh,...,58.56,43.32,no,219079.0,21.74,M,True,"([accountability and transparency, programmati...",accountability and transparency,0.795397
35074,15030816,2014-11-27,16,"Water Resources, River Development and Ganaga ...",Unstarred,(a) whether the Government is aware that five ...,"THE MINISTER OF STATE FOR WATER RESOURCES, RIV...",CHHOTELAL,BJP,Uttar Pradesh,...,54.05,42.69,no,190486.0,21.50,M,False,"([accountability and transparency, programmati...",accountability and transparency,0.905049


In [None]:
# cleaning up results

def process_classification_results(row):
    if isinstance(row, str):
        labels, scores = ast.literal_eval(row)
    else:
        labels, scores = row

    # dictionary with labels as keys and scores as values
    return dict(zip(labels, scores))

In [None]:
# putting the function to the 'classification_results' column
df_merged_final['processed_results'] = df_merged_final['classification_results'].apply(process_classification_results)

In [None]:
# separate columns for each classification result
classification_labels = ['programmatic representation', 'accountability and transparency', 'clientelistic representation', 'development', 'corruption']

for label in classification_labels:
    df_merged_final[label + '_score'] = df_merged_final['processed_results'].apply(lambda x: x.get(label, None))

In [None]:
df_merged_final.head()

Unnamed: 0,id,date,ls_number,ministry,question_type,question_text,answer_text,member,party,state,...,gender_change,classification_results,top_category,top_category_score,processed_results,development_score,corruption_score,programmatic representation_score,accountability and transparency_score,clientelistic representation_score
304,17350913,1999-12-09,13,Rural Development,Unstarred,(a) whether any scheme is under consideation o...,MINISTER OF STATE FOR RURAL DEVELOPMENT (SHRI ...,ABDUL RASHID SHAHEEN,JKN,Jammu & Kashmir,...,False,"([programmatic representation, accountability ...",programmatic representation,0.769578,{'programmatic representation': 0.769578337669...,0.193197,0.018498,0.769578,0.663415,0.239772
306,17354313,1999-12-01,13,Prime Minister,Unstarred,(a) whether the Government propose to conduct ...,THE MINISTER OF STATE IN THE DEPARTMENT OF ATO...,T. GOVINDAN,CPM,Kerala,...,False,"([accountability and transparency, programmati...",accountability and transparency,0.614412,{'accountability and transparency': 0.61441200...,0.259336,0.003222,0.410504,0.614412,0.292619
320,17353813,1999-12-02,13,Rural Development,Unstarred,(a) The guidelines and directions issued by th...,Minister of State for RURAL DEVELOPMENT (SHRI ...,SAMAR CHOWDHURY,CPM,Tripura,...,False,"([programmatic representation, accountability ...",programmatic representation,0.874857,{'programmatic representation': 0.874856531620...,0.505724,0.005591,0.874857,0.855427,0.30582
324,10021313,2004-02-04,13,Shipping,Unstarred,(a) the time when the Sompen Hospital Ship arr...,MINISTER OF STATE IN THE MINISTRY OF SHIPPING ...,BISHNU PADA RAY,BJP,Andaman & Nicobar Islands,...,False,"([accountability and transparency, clientelist...",accountability and transparency,0.433297,{'accountability and transparency': 0.43329674...,0.021046,0.005672,0.187835,0.433297,0.340559
325,16344514,2004-08-16,14,Tourism,Unstarred,(a) whether the Government has any proposal to...,MINISTER OF STATE FOR TOURISM (INDEPENDENT CHA...,MANORANJAN BHAKTA,INC,Andaman & Nicobar Islands,...,False,"([development, accountability and transparency...",development,0.952534,"{'development': 0.9525343775749207, 'accountab...",0.952534,0.0073,0.682238,0.901977,0.583766


In [None]:
df_merged_final = df_merged_final.drop(columns=['programmatic_representation_score', 'accountability_and_transparency_score',
                                                'clientelistic_representation_score',
                     ])

In [None]:
df_merged_final.to_csv('dataset_final.csv', index=False)