In [1]:
import pandas as pd
import numpy as np
import re
from nltk.stem import WordNetLemmatizer

In [2]:
# Topic lookup table (columns shown below: topic, subject).
topics = pd.read_csv('topic_dataset.csv')
# Interview questions, decoded as ISO-8859-1 (Latin-1).
# NOTE(review): some rows display mojibake such as "â '", so the file may
# actually be stored in a different encoding — confirm before changing it.
questions = pd.read_csv('500_questions.csv', encoding='iso-8859-1')

In [3]:
# Preview the first ten questions as loaded.
questions.head(n=10)

Unnamed: 0.1,Unnamed: 0,question
0,2254,(Avg. of 42 ) : What is a SQL view?
1,1390,Question involving Primary Key Integrity const...
2,1224,How secure are encrypted stored procedures tr...
3,1172,8. A candidate appearing for an examinatio n ...
4,1135,Consider the following areas for tuning. The o...
5,22,2. Why we cant create object for abstract class?
6,1166,6.transfering of pages from memory to peripher...
7,1693,There are 30 questions in which 2 are STAR QUE...
8,994,A) InfoPackage â ' Scheduler â ' Repair Reques...
9,374,An anchoring object is a print condition objec...


In [4]:
# Preview the first five rows of the topic table.
topics.head(n=5)

Unnamed: 0.1,Unnamed: 0,topic,subject
0,1115,fcfs,Operating System
1,1923,dml,Database Management System
2,835,jdbc,Object_oriented_programing
3,1866,joints,Database Management System
4,1125,buffering,Operating System


In [5]:
# Trigger words used to categorize the questions.
# Each key is a topic label; its value lists the lower-case words whose presence
# in a question assigns that question to the topic. The keys are later used as
# column labels taken from `topics.topic`, so their spelling and case must match
# the topic names exactly (e.g. 'Inheritance', 'Object').
related_words = {
    'fcfs': ['fcfs'],
    'dml': ['dml'],
    'jdbc': ['jdbc', 'connectivity'],
    'joints': ['joint', 'join', 'joined'],
    'buffering': ['buffering', 'buffer', 'buffered'],
    'applet': ['applet'],
    'deadlock': ['deadlock', 'deadlocking', 'deadlocked'],
    'ddl': ['ddl'],
    # 'polymorphic' was previously only present in the misspelled form
    # 'polymorhic', so the correctly spelled word never matched. The misspelling
    # is kept so that any question it matched before still matches.
    'polymorphism': ['polymorphism', 'polymorphic', 'polymorhic'],
    'caching': ['caching', 'cached', 'cache'],
    'thread': ['thread', 'threading', 'threaded'],
    'security': ['security', 'secure', 'secured'],
    'scheduling': ['scheduling', 'scheduled', 'schedule', 'scheduler'],
    'Inheritance': ['inheritance', 'inherited', 'inherit', 'inherits'],
    'encapsulation': ['encapsulation', 'encapsule', 'encapsuled', 'encapsulate'],
    'transaction': ['transaction', 'transact', 'transacted', 'transactional'],
    'Object': ['object'],
    'sql': ['sql', 'view'],
    'keys': ['key'],
}

In [6]:
lemmatizer = WordNetLemmatizer()

# Invert `related_words` once: trigger word -> topics it signals, listed in
# `related_words` order. Each question word then needs one lemmatization and
# one dictionary lookup instead of one lemmatization per topic.
topics_by_word = {}
for key, word_list in related_words.items():
    for trigger in word_list:
        topics_by_word.setdefault(trigger, []).append(key)


def _topics_for(text):
    """Return the set of topic keys whose trigger words occur in `text`.

    `text` is lower-cased and split on every non-word character; each piece is
    lemmatized and looked up among the trigger words. Anything that is not a
    string (e.g. the NaN pandas uses for a missing question) yields an empty set
    instead of raising on `.lower()`.
    """
    found = set()
    if not isinstance(text, str):
        return found
    for word_fromQ in re.split(r'[^\w]', text.lower()):
        if not word_fromQ:
            # Adjacent separators produce empty pieces; they can never match.
            continue
        found.update(topics_by_word.get(lemmatizer.lemmatize(word_fromQ), ()))
    return found


# categorizing each question by going through each word; the column is built in
# one assignment rather than mutated through chained indexing.
questions['topic'] = [_topics_for(text) for text in questions['question']]

In [7]:
# Number of questions whose topic set is empty, i.e. no trigger word matched.
has_no_topic = questions['topic'].map(len) == 0
print(has_no_topic.sum())

25


In [8]:
# Order the questions by the 'Unnamed: 0' column, largest value first.
# (Presumably that column encodes how frequently a question is asked — confirm.)
questions = questions.sort_values(by='Unnamed: 0', ascending=False)

In [9]:
# Preview the first five questions after sorting, now with their topic sets.
questions.head(n=5)

Unnamed: 0.1,Unnamed: 0,question,topic
430,2421,A few SQL queries were also asked (find second...,{sql}
51,2420,The other 25 questions were technical question...,"{sql, Object}"
213,2418,The panel was too helpful and friendly. They t...,{sql}
63,2416,The interview was quite easy. Most of the ques...,"{sql, joints}"
127,2415,For IS/CS DBMS basic knowledge SQL queries is...,"{polymorphism, sql, Inheritance}"


In [10]:
# Build a table with one column per topic holding that topic's questions, in the
# current row order of `questions`.
#
# The questions are first gathered into plain lists and the frame is created in
# one step. Writing cell by cell through `result[top].at[...]` is a chained
# assignment (it does not update `result` when pandas Copy-on-Write is active),
# and row labels from 120 upwards do not exist in a 120-row frame.
MIN_ROWS = 120  # the sheet is never shorter than the 120 rows it had before

questions_by_topic = {name: [] for name in topics.topic.tolist()}

for row in questions.itertuples():
    for top in row.topic:
        # A topic key that is absent from `topics.topic` raises KeyError here,
        # which surfaces a mismatch between the two inputs instead of hiding it.
        questions_by_topic[top].append(row.question)

# Number of questions collected per topic (no hard-coded topic count needed).
count = {name: len(items) for name, items in questions_by_topic.items()}

# Tall enough for the largest topic, padded with NaN for the shorter ones.
n_rows = max([MIN_ROWS, *count.values()])

# creating a new dataframe that will hold the questions topic wise
result = pd.DataFrame(
    {name: pd.Series(items, dtype=object) for name, items in questions_by_topic.items()},
    index=range(n_rows),
)
result.columns.name = 'FAQs'
result.index.name = 'index'

In [11]:
# Relabel a column called 'topic' as 'questions'; columns with any other label
# are left untouched (none of the column labels displayed below is 'topic').
result = result.rename(columns={'topic': 'questions'})

In [12]:
# Preview the first five rows of the topic-wise question table.
result.head(n=5)

FAQs,fcfs,dml,jdbc,joints,buffering,applet,deadlock,ddl,polymorphism,caching,thread,security,scheduling,Inheritance,encapsulation,transaction,Object,sql,keys
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,? Case in which fcfs is the best algo,f) Embedded Sql :: Incorporate DDL DML and T.C...,When testing connectivity in SQL Server you u...,The interview was quite easy. Most of the ques...,The On-line Redo Log is a set of tow or more o...,3.when an java applet get's downloaded what ha...,Questions were fired one after another from de...,Can one use dynamic SQL within PL/SQL? OR Can ...,For IS/CS DBMS basic knowledge SQL queries is...,The data dictionary cache is stored in an area...,Imagine this scenario: I/O completion ports ar...,You are developing security policy for your SQ...,This is a bit tough round . The interviewer as...,For IS/CS DBMS basic knowledge SQL queries is...,The questions were very simple like about the ...,COMMIT makes permanent the changes resulting f...,The other 25 questions were technical question...,A few SQL queries were also asked (find second...,Function key triggers are associated with indi...
1,,c) Transaction Control Statements :: Manage ch...,Then he asked me to tell OOPS concepts Inheri...,I attended Virtusa written test in Hyderabad. ...,a) Data Base Writer(DBWR) :: Data Base Writer ...,27-Java applet of a moving /waving file is run...,This level of error severity indicates a trans...,f) Embedded Sql :: Incorporate DDL DML and T.C...,Then java it is quit easy. Polymorphism metho...,The data dictionary cache is stored in an area...,OS-CPU Scheduling algorithms Threads with Lif...,? See the figure numbers correctly They ask qu...,The result was declared on 10th of august and ...,One additional key difference between interfac...,TECH RND2: Threads again they told to write p...,ROLLBACK retracts any of the changes resulting...,In SQL Server 2005 which of the following sch...,The other 25 questions were technical question...,When you install SQL Server 2005 and create a ...
2,,The answer is Phantoms rows. A phantom row ref...,16. How many types of JDBC driver-,If you have good domain knowledge and concepts...,6.transfering of pages from memory to peripher...,Me : Swings Applets sir. I am also interested...,Q- What is deadlock and blocking with respect ...,Can a particular event in SQL Server 2005 suc...,2.What is Runtime Polymorphism ?,a) Data Base Writer(DBWR) :: Data Base Writer ...,many threads do u want for this?,Me: security platform independent...,A) InfoPackage â ' Scheduler â ' Repair Reques...,The questions were very simple like about the ...,He asked me diff btwn array n linked list pol...,For long transactions that contain many SQL st...,DDL triggers cannot be signed but all the oth...,The panel was too helpful and friendly. They t...,On 21st I was called for interview I just had...
3,,6. dcl dml ddl language and example.,Me: JDBC MULTITHREADING OOPS CONCEPTS.,SQL queries were from join Subqueries Clusters...,Use the buffer cache advisory over a given wor...,,Deadlock graph(2) error detection in which l...,6. dcl dml ddl language and example.,He asked me diff btwn array n linked list pol...,Use the buffer cache advisory over a given wor...,First-round which was Aptitude round has happe...,There was upper limit for cut off. Students wh...,The technical interview was scheduled on 4 Sep...,Then he asked me to tell OOPS concepts Inheri...,3) asked wht is encapsulation ..explain it til...,You can use tracer tokens in SQL Server 2005 r...,The object?s state (values)How does output cac...,The interview was quite easy. Most of the ques...,Also very imp. That only put things in your re...
4,,Q) Tell the commands of ddl dml dcl &tcl comm...,Without using strrev reversing a linked list ...,Basic sql queries. For union joins differenc...,X$KCBCBH Cache Buffer Current Buffer Heade...,,This level of error severity indicates a trans...,Q) Tell the commands of ddl dml dcl &tcl comm...,Me: Polymorphism is the ability of an object t...,X$KCBCBH Cache Buffer Current Buffer Heade...,Write a java program snippet for thread synchr...,We had to go to Polaris company for interview ...,It mainly covers Operating system. consists of...,TECH RND2: Threads again they told to write p...,- What is encapsulation in C++?,If you have good domain knowledge and concepts...,Me: Polymorphism is the ability of an object t...,For IS/CS DBMS basic knowledge SQL queries is...,The key to crack this round is also you alread...


In [13]:
# Persist the topic-wise question table next to the notebook.
result.to_csv(path_or_buf='result.csv')