In [1]:
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy
from scipy.sparse.linalg import svds
from tqdm import tqdm
import re
import xml
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import string

In [2]:
def clean_text(text):
    '''Make text lowercase,remove punctuation
    .'''
    text = str(text).lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    return text

# Read data

In [3]:
questions =  pd.read_csv("input/questions.csv")
professionals = pd.read_csv("input/professionals.csv")
answers = pd.read_csv("input/answers.csv")

In [4]:
questions.columns

Index(['questions_id', 'questions_author_id', 'questions_date_added',
       'questions_title', 'questions_body'],
      dtype='object')

In [5]:
questions.head()

Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26 UTC+0000,Teacher career question,What is a maths teacher? what is a ma...
1,eb80205482e4424cad8f16bc25aa2d9c,acccbda28edd4362ab03fb8b6fd2d67b,2016-05-20 16:48:25 UTC+0000,I want to become an army officer. What can I d...,I am Priyanka from Bangalore . Now am in 10th ...
2,4ec31632938a40b98909416bdd0decff,f2c179a563024ccc927399ce529094b5,2017-02-08 19:13:38 UTC+0000,Will going abroad for your first job increase ...,I'm planning on going abroad for my first job....
3,2f6a9a99d9b24e5baa50d40d0ba50a75,2c30ffba444e40eabb4583b55233a5a4,2017-09-01 14:05:32 UTC+0000,To become a specialist in business management...,i hear business management is a hard way to ge...
4,5af8880460c141dbb02971a1a8369529,aa9eb1a2ab184ebbb00dc01ab663428a,2017-09-01 02:36:54 UTC+0000,Are there any scholarships out there for stude...,I'm trying to find scholarships for first year...


In [6]:
answers.head()

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14 UTC+0000,<p>Hi!</p>\n<p>You are asking a very interesti...
1,ada720538c014e9b8a6dceed09385ee3,2aa47af241bf42a4b874c453f0381bd4,eb80205482e4424cad8f16bc25aa2d9c,2018-05-01 14:19:08 UTC+0000,<p>Hi. I joined the Army after I attended coll...
2,eaa66ef919bc408ab5296237440e323f,cbd8f30613a849bf918aed5c010340be,eb80205482e4424cad8f16bc25aa2d9c,2018-05-02 02:41:02 UTC+0000,"<p>Dear Priyanka,</p><p>Greetings! I have answ..."
3,1a6b3749d391486c9e371fbd1e605014,7e72a630c303442ba92ff00e8ea451df,4ec31632938a40b98909416bdd0decff,2017-05-10 19:00:47 UTC+0000,<p>I work for a global company who values high...
4,5229c514000446d582050f89ebd4e184,17802d94699140b0a0d2995f30c034c6,2f6a9a99d9b24e5baa50d40d0ba50a75,2017-10-13 22:07:33 UTC+0000,I agree with Denise. Every single job I've had...


In [7]:
answers.columns

Index(['answers_id', 'answers_author_id', 'answers_question_id',
       'answers_date_added', 'answers_body'],
      dtype='object')

In [8]:
professionals.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19 UTC+0000
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21 UTC+0000
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26 UTC+0000
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",,,2011-11-09 20:39:29 UTC+0000
4,e2d57e5041a44f489288397c9904c2b2,,,,2011-12-10 22:14:44 UTC+0000


In [9]:
professionals.columns

Index(['professionals_id', 'professionals_location', 'professionals_industry',
       'professionals_headline', 'professionals_date_joined'],
      dtype='object')

In [10]:
prof_ans = pd.merge(professionals, answers, how = 'left' ,
                    left_on = 'professionals_id', right_on = 'answers_author_id')
prof_ans_q = pd.merge(prof_ans, questions, how = 'left' ,
                      left_on = 'answers_question_id', right_on = 'questions_id')

In [11]:
prof_ans_q.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,questions_id,questions_author_id,questions_date_added,questions_title,questions_body
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19 UTC+0000,7640a6e5d5224c8681cc58de860858f4,9ced4ce7519049c0944147afb75a8ce3,f6b9ca94aed04ba28256492708e74f60,2011-10-05 20:42:09 UTC+0000,<p>Basically three things: </p>\n<ol>\n<li>Big...,f6b9ca94aed04ba28256492708e74f60,05444a2f42454327b2ac4b463c0adbe0,2011-09-27 15:26:19 UTC+0000,What do top tier consulting firms look for in ...,Please explain the factors consulting firms lo...
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21 UTC+0000,,,,,,,,,,
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26 UTC+0000,f18d7fca363d4b21a81e6683c5a86b96,0c673e046d824ec0ad0ebe012a0673e4,e214acfbe6644d65b889a3268828db9d,2012-10-01 04:35:42 UTC+0000,"<html><head></head><body><p>Hi Deja,</p>\n<p>K...",e214acfbe6644d65b889a3268828db9d,16908136951a48ed942738822cedd5c2,2012-09-09 05:33:25 UTC+0000,what does it take to be an anesthesiologist?,I am a sophomore who is interested in learning...
3,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26 UTC+0000,5d670d5f8700402ab56bae609b06d02d,0c673e046d824ec0ad0ebe012a0673e4,9d211b99e17c46fbbaca03dc6b43f1c4,2012-10-01 04:55:03 UTC+0000,<html><head></head><body><p>It looks like this...,9d211b99e17c46fbbaca03dc6b43f1c4,16908136951a48ed942738822cedd5c2,2012-09-09 00:43:18 UTC+0000,what are some of the ups and downs of being a ...,I am a sophomore who is interested in learning...
4,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26 UTC+0000,6d823a750e294c75b10fafbbbaf19855,0c673e046d824ec0ad0ebe012a0673e4,e978b437413048c183e3cb556f90a878,2012-02-13 16:44:10 UTC+0000,<p><em>[Posted on behalf of a CareerVillage Pr...,e978b437413048c183e3cb556f90a878,177f482adc1a449284471f7c556ee8f0,2012-02-13 13:45:52 UTC+0000,What do I have to do to become an investor?,<p>I would like to invest in other companies w...


In [12]:
prof_ans_q = prof_ans_q[(~prof_ans_q["questions_title"].isna()) | (~prof_ans_q["questions_body"].isna()) ]

In [13]:
len(prof_ans_q)

50106

# Questions

In [14]:
q = prof_ans_q["questions_title"] + " " + prof_ans_q["questions_body"]
q  = q.apply(lambda x:clean_text(x))
len(q)

50106

# TF - IDF

In [15]:
N_FEATURES = 1000
MAX_DF     = 0.95
MIN_DF     = 2
LANGUAGE   = 'english'

> max_df  : float or int, default=1.0            

> When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float in range [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.          

> min_df float or int, default=1       

> When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

In [16]:
tfidf_vectorizer = TfidfVectorizer(stop_words=LANGUAGE,max_df = MAX_DF, min_df = MIN_DF)

In [17]:
q = q.dropna()
tfidf_vectorizer.fit(q)
q_tfidf = tfidf_vectorizer.transform((q))
q_tfidf.shape

(50106, 18031)

# Save the TFIDF vectors

In [18]:
import pickle
pickle.dump(tfidf_vectorizer,open("tfidf_vectorizer.pkl","wb"))
pickle.dump(q_tfidf,open("q_tfidf.pkl","wb"))

# New Question

In [19]:
q_new = "I want to be a data scientist. What should I study"
q_new = [q_new]


with open('tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer2 = pickle.load(f)
with open('q_tfidf.pkl', 'rb') as f:
    q_tfidf2 = pickle.load(f)

In [20]:
q_new_tfidf = tfidf_vectorizer2.transform(q_new)

# Cosine Similarity

In [21]:
cosine_similarity(q_new_tfidf,q_tfidf2).shape

(1, 50106)

In [22]:
result = cosine_similarity(q_new_tfidf,q_tfidf2)
result.argmax()

48865

In [23]:
np.argsort(result)[:,-10:]

array([[29952, 40984, 36787, 23674, 23168, 37539, 41715, 35730, 49181,
        48865]], dtype=int64)

In [24]:
result.shape

(1, 50106)

In [25]:
result.argmax()

48865

# Best Question

In [26]:
q.iloc[result.argmax()] , result[:,result.argmax()]

('what should i learn to be a data scientist i want to be a data scientist  what online courses should i take  datascience\r',
 array([0.78548831]))

# Best Answer

In [27]:
prof_ans_q.iloc[result.argmax()]["answers_body"]

'<p> </p><p>Hello Chong G.</p><p> </p><p>I am not a data scientist, but I think I can give you some advice on this. Nowadays, an increasing number of professions are requiring analytics capabilities.</p><p> </p><p> </p><p>There are some core things you should learn to handle great amount of data, like:</p><p>&nbsp;</p><p>Relational Database concepts;</p><p>SQL - Computer language for creating and managing databases;</p><p>Excel;</p><p>Programing languages such as C, VBA, R...&nbsp;</p><p><br></p><p>You should also consider learning how to display the data in an organized way and Power BI / Think-Cell are great for that</p><p>  </p><p>There are several tutorials around the internet about those topics and also focused courses. I personally recommend the latter, because it is easier to progress through the topics.</p><p><br></p><p>Hope my advice was helpful to you!</p><p> </p>'

# Best Answer

In [28]:
prof_ans_q.iloc[np.argsort(result)[:,-10:][0][9]]["answers_body"]

'<p> </p><p>Hello Chong G.</p><p> </p><p>I am not a data scientist, but I think I can give you some advice on this. Nowadays, an increasing number of professions are requiring analytics capabilities.</p><p> </p><p> </p><p>There are some core things you should learn to handle great amount of data, like:</p><p>&nbsp;</p><p>Relational Database concepts;</p><p>SQL - Computer language for creating and managing databases;</p><p>Excel;</p><p>Programing languages such as C, VBA, R...&nbsp;</p><p><br></p><p>You should also consider learning how to display the data in an organized way and Power BI / Think-Cell are great for that</p><p>  </p><p>There are several tutorials around the internet about those topics and also focused courses. I personally recommend the latter, because it is easier to progress through the topics.</p><p><br></p><p>Hope my advice was helpful to you!</p><p> </p>'

# 2nd Best Answer

In [29]:
prof_ans_q.iloc[np.argsort(result)[:,-10:][0][8]]["answers_body"]

'<p>You should search for Algorithm videos. Usually when studying data, you would need to know about databases structure, analytics skills, and some other logics. Another thing you could do would be start analyzing some small real cases like how long does it take to go from your house to the supermarket and what you could do to reduce the time? or how often do you drink water (time gap between each occurence). How could you track that? and how could you improve it? is it good?</p><p>these are a few examples on how you could analyze stuff.</p>'

# 3rd Best Answer

In [30]:
prof_ans_q.iloc[np.argsort(result)[:,-10:][0][7]]["answers_body"]

"<p>Hi Yingyi,</p><p><br></p><p>Great question and excited to see you're interested in Data Science. Data Science is a not really a unified field and a bit buzzwordy. Data scientist come from all sorts of backgrounds (computer science, political science, physics, statistics, even Creative Writing [me!]) and work on all sorts of problems. One one hand you'll may have a data scientists working with advertisement data trying to predict target audiences and click through rates and on the other a data scientist using deep learning to analyze MRI scans for cancerous tumors. And all sorts of things in between. In my role as a Data Scientist at Talla, I research machine learning and deep learning technique to solve natural language problems like teaching a computer to read text, answer questions, and classify large corpuses of data. </p><p><br></p><p> You shouldn't worry about not having a Data Science major, your double major cover the majority of the baseline knowledge you'll need. Here's a 