In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

## Python Questions from Stack Overflow
Full text of Stack Overflow Q&A about the Python programming language.

From [Kaggle](https://www.kaggle.com/stackoverflow/pythonquestions)

In [2]:
path = '/home/caro/Desktop/cursosPlatzi/PlatziDataChallenge/PDC2'

files = os.listdir(path)
files_list = [f for f in files if f.endswith('.csv')]
files_list

['Questions.csv', 'Tags.csv']

In [3]:
df1 = pd.DataFrame() #Questions
for trozo in pd.read_csv(files_list[1], chunksize=1000, encoding = 'latin1'):
    df1 = pd.concat([df1,trozo])
df1.shape

(1885078, 2)

In [4]:
df2 = pd.DataFrame() #Tags
for trozo in pd.read_csv(files_list[0], chunksize=1000, encoding = 'latin1'):
    df2 = pd.concat([df2,trozo])
df2.shape

(607282, 6)

In [6]:
df_tot = df1.merge(df2, how = 'outer', on='Id')
df_tot.shape


(1885078, 7)

## 1. Total number of users in the Questions dataset 

In [7]:
df_tot['OwnerUserId'].nunique()

213927

## 2. Total number of Tags in the Questions dataset

In [8]:
df_tot['Tag'].nunique()

16895

## 3. Total number of Questions in the dataset

In [9]:
df_tot['Id'].nunique()

607283

## 4. Top ten of tags in the Questions dataset

In [10]:
df_tot.groupby('Tag')['Tag'].count().sort_values(ascending=False)[1:11]

Tag
django        62818
python-2.7    34616
pandas        26854
python-3.x    26814
numpy         25848
list          18951
matplotlib    16521
regex         14047
dictionary    13413
tkinter       10766
Name: Tag, dtype: int64

## 5. Top ten of Questions with highest scores

In [11]:
df2.sort_values('Score',ascending=False).head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
962,231767,18300.0,2008-10-23T22:21:11Z,5524,"What does the ""yield"" keyword do?",<p>What is the use of the <code>yield</code> k...
334,100003,9951.0,2008-09-19T06:10:46Z,3219,What is a metaclass in Python?,<p>What are metaclasses? What do you use them ...
291,82831,15616.0,2008-09-17T12:55:00Z,2729,How do I check whether a file exists using Pyt...,"<p>How do I check whether a file exists, witho..."
1874,394809,51518.0,2008-12-27T08:32:18Z,2655,Does Python have a ternary conditional operator?,<p>If Python does not have a ternary condition...
306,89228,17085.0,2008-09-18T01:35:30Z,2312,Calling an external command in Python,<p>How can I call an external command (as if I...
2018,419163,51518.0,2009-01-07T04:11:00Z,2184,"What does `if __name__ == ""__main__"":` do?","<p>What does the <code>if __name__ == ""__main_..."
4086,739654,1897.0,2009-04-11T07:05:31Z,1920,How to make a chain of function decorators in ...,<p>How can I make two decorators in Python tha...
116,38987,3207.0,2008-09-02T07:44:30Z,1867,How to merge two Python dictionaries in a sing...,"<p>I have two Python dictionaries, and I want ..."
3216,613183,2786.0,2009-03-05T00:49:05Z,1837,Sort a Python dictionary by value,<p>I have a dictionary of values read from two...
1175,273192,13055.0,2008-11-07T18:56:45Z,1739,How to check if a directory exists and create ...,<p>What is the most elegant way to check if th...


## 6. Tags of the top 10 questions with highest scores

In [12]:
df_tot[df_tot['Score']>1700].groupby('Tag')['Tag'].count().sort_values(ascending=False)

Tag
python                  10
dictionary               2
idioms                   2
yield                    1
iterator                 1
conditional-operator     1
coroutine                1
decorator                1
directory                1
exception                1
expression               1
external                 1
file                     1
filesystems              1
generator                1
main                     1
ternary-operator         1
mapping                  1
metaclass                1
module                   1
namespaces               1
oop                      1
operators                1
python-2.5               1
python-datamodel         1
python-decorators        1
shell                    1
sorting                  1
subprocess               1
command                  1
Name: Tag, dtype: int64

## 7. Top 10 of Tags with highest scores

In [13]:
df_tot.groupby('Tag')['Score'].sum().sort_values(ascending=False)[1:11]

Tag
django        116416.0
numpy          67737.0
list           60352.0
pandas         51380.0
string         46363.0
python-3.x     44465.0
python-2.7     42602.0
matplotlib     42090.0
dictionary     39003.0
pip            25354.0
Name: Score, dtype: float64

## 8. Top 10 of users with highest scores

In [14]:
df2.groupby('OwnerUserId')['Score'].sum().sort_values(ascending=False).head(10)

OwnerUserId
51816.0     7414
9951.0      5770
18300.0     5568
46646.0     5073
51518.0     4956
179736.0    4184
4872.0      4095
4766.0      3458
15055.0     3144
76701.0     3022
Name: Score, dtype: int64

## 9. Top 10 of users with lowest scores

In [15]:
df2.groupby('OwnerUserId')['Score'].sum().sort_values(ascending=True).head(10)

OwnerUserId
2955338.0   -44
4674272.0   -34
4212189.0   -21
5194936.0   -18
4864486.0   -18
5476930.0   -17
5430566.0   -17
6014650.0   -17
6288320.0   -16
3968268.0   -16
Name: Score, dtype: int64

## 10. Top 10 of users with more Questions

In [16]:
df2.groupby('OwnerUserId')['OwnerUserId'].count().sort_values(ascending=False).head(10)

OwnerUserId
651174.0     643
179736.0     489
283296.0     375
3371056.0    354
610569.0     348
1107049.0    344
308827.0     341
578822.0     314
248237.0     294
1592380.0    291
Name: OwnerUserId, dtype: int64