In [None]:
# Load libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool
# Default seaborn colour palette, kept in a module-level name.
color = sns.color_palette()
# NOTE(review): `py` and `offline` (below) are two aliases for the same
# plotly.offline module, and init_notebook_mode is called three times in this
# cell; a single call is probably sufficient — confirm before trimming.
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()

In [None]:
# Read the data.
# project_essay_3 / project_essay_4 are forced to object dtype so pandas does
# not infer another type for them (presumably they are mostly empty — TODO confirm).
essay_dtypes = {"project_essay_3": object, "project_essay_4": object}
train = pd.read_csv("../input/train.csv", dtype=essay_dtypes)
test = pd.read_csv("../input/test.csv", dtype=essay_dtypes)
resources = pd.read_csv("../input/resources.csv")

# Split the training rows by outcome. .copy() makes these independent frames,
# so later cells that overwrite columns on them do not write into a slice of
# `train` (which triggers SettingWithCopyWarning).
approvals = train[train.project_is_approved == 1].copy()
rejects = train[train.project_is_approved == 0].copy()

In [None]:
# Report (rows, columns) for each of the three loaded tables.
for caption, frame in (
    ("Size of training data : ", train),
    ("Size of test data : ", test),
    ("Size of resource data : ", resources),
):
    print(caption, frame.shape)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Column dtypes of the training table.
train.dtypes

In [None]:
# Print the concise summary (columns, non-null counts, dtypes) of the training table.
train.info()

In [None]:
# Summary statistics for the training table (default column selection).
train.describe()

In [None]:
# Summary statistics restricted to object-dtype columns of the training table.
train.describe(include=["O"])

In [None]:
# Preview the first rows of the resources table.
resources.head()

In [None]:
# Column dtypes of the resources table.
resources.dtypes

In [None]:
# Print the concise summary (columns, non-null counts, dtypes) of the resources table.
resources.info()

In [None]:
# Summary statistics for the resources table (default column selection).
resources.describe()

In [None]:
# Summary statistics restricted to object-dtype columns of the resources table.
resources.describe(include=["O"])

In [None]:
# Share of training rows whose teacher had 0, exactly 1, or more than 1
# previously posted projects, each as a percentage of all training rows.
previous_posts = train['teacher_number_of_previously_posted_projects']
total_rows = train.shape[0]

zero_count = train[previous_posts == 0]
one_count = train[previous_posts == 1]
more_than_one = train[previous_posts > 1]

zero_project_percentage = (float(len(zero_count)) / total_rows) * 100
print("Percentage of teachers with their first project: " + str(zero_project_percentage))

one_count_percentage = (float(len(one_count)) / total_rows) * 100
print("Percentage of teachers with only one project: " + str(one_count_percentage))

more_than_one_percentage = (float(len(more_than_one)) / total_rows) * 100
print("Percentage of teachers with more than one project: " + str(more_than_one_percentage))

In [None]:
# Distribution of the number of previously posted projects per submission.
plt.figure(figsize = (12, 8))

# distplot overlays a KDE by default, which normalises the histogram, so the
# y-axis is a density rather than a raw count; the label below reflects that.
# NOTE(review): distplot is deprecated in newer seaborn releases (histplot /
# displot replace it) — switch once the environment's seaborn version is known.
sns.distplot(train['teacher_number_of_previously_posted_projects'])
plt.title('Histogram of number of previously posted applications by the submitting teacher')
plt.xlabel('Number of previously posted applications by the submitting teacher', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.show()

In [None]:
# Coarse histogram of previously posted projects using three hand-picked,
# unequal-width bins: [0, 10), [10, 150), [150, 450].
# NOTE(review): values above 450 would fall outside the last bin and be
# dropped by plt.hist — confirm the column maximum.
plt.figure(figsize = (12, 8))
plt.hist(train['teacher_number_of_previously_posted_projects'], bins=[0, 10, 150, 450])
# Title fixed: the original ended with an unbalanced ')'.
plt.title('Histogram Counting # of Teachers that Previously Posted Projects')
plt.xlabel('Projects')
plt.ylabel('Count')
plt.show()

In [None]:
# Spot check: how many training rows share a teacher_id with the rows
# labelled 0 and 4. A count above 1 means that teacher appears more than once.
teacher_ids = train['teacher_id']
for row_label in (0, 4):
    same_teacher = teacher_ids == teacher_ids[row_label]
    print(same_teacher.sum())

In [None]:
# Pie chart of the approved / not-approved split, expressed in percent.
project_approved = train['project_is_approved'].value_counts()
sizes = project_approved.div(project_approved.sum()).mul(100)
labels = project_approved.index

trace = go.Pie(values=sizes, labels=labels, hoverinfo='label+percent')
data = [trace]
layout = go.Layout(title='Status of Project Proposal')
fig = go.Figure(layout=layout, data=data)
py.iplot(fig)

In [None]:
# Frequency of each teacher_prefix value in the training table.
train['teacher_prefix'].value_counts()

In [None]:
# Stacked bar chart: accepted vs. rejected proposal counts per teacher prefix,
# ordered as value_counts() returns them.
temp = train["teacher_prefix"].value_counts()
outcome = train['project_is_approved']
prefix = train["teacher_prefix"]

temp_y1 = [np.sum(outcome[prefix == val] == 1) for val in temp.index]
temp_y0 = [np.sum(outcome[prefix == val] == 0) for val in temp.index]

trace1 = go.Bar(x=temp.index, y=temp_y1, name='Accepted Proposals')
trace2 = go.Bar(x=temp.index, y=temp_y0, name='Rejected Proposals')
data = [trace1, trace2]

layout = go.Layout(
    title="Popular Teacher prefixes in terms of project acceptance rate and project rejection rate",
    barmode='stack',
    width=1000,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Derive a coarse "gender" column from the teacher prefix; prefixes that do
# not imply a gender (and missing prefixes) are mapped to "Unknown".
gender_mapping = {
    "Mrs.": "Female",
    "Ms.": "Female",
    "Mr.": "Male",
    "Teacher": "Unknown",
    "Dr.": "Unknown",
    np.nan: "Unknown",
}
train["gender"] = train.teacher_prefix.map(gender_mapping)

# Bar chart of each gender bucket's share of submitted proposals, in percent.
temp = train["gender"].value_counts()
trace = go.Bar(x=temp.index, y=temp / temp.sum() * 100)
data = [trace]

grey = 'rgb(107, 107, 107)'
tick_style = dict(size=14, color=grey)
layout = go.Layout(
    title="Gender in terms of projects proposals submitted in % ",
    xaxis=dict(title='Gender', tickfont=tick_style),
    yaxis=dict(
        title='Count of project proposals submitted in % ',
        titlefont=dict(size=16, color=grey),
        tickfont=tick_style,
    ),
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Print the frequency table for each listed categorical column.
columns_to_count = ['project_grade_category']
for label in columns_to_count:
    frequency = train[label].value_counts()
    print(frequency)

In [None]:
# Bar chart of each school grade level's share of submitted proposals, in percent.
temp = train["project_grade_category"].value_counts()
trace = go.Bar(
    x = temp.index,
    y = (temp / temp.sum())*100,
)
data = [trace]
layout = go.Layout(
    title = "Distribution of project_grade_category (school grade levels) in %",
    xaxis=dict(
        title='school grade levels',
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='Count of project proposals submitted in % ',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    )
)
fig = go.Figure(data=data, layout=layout)
# The filename was a copy-paste leftover ('schoolStateNames') unrelated to
# this chart; use a name that matches the plotted column.
py.iplot(fig, filename='projectGradeCategory')

In [None]:
# Stacked bar chart: accepted vs. rejected proposal counts per school grade level.
temp = train["project_grade_category"].value_counts()
outcome = train['project_is_approved']
grade = train["project_grade_category"]

temp_y1 = [np.sum(outcome[grade == val] == 1) for val in temp.index]
temp_y0 = [np.sum(outcome[grade == val] == 0) for val in temp.index]

trace1 = go.Bar(x=temp.index, y=temp_y1, name='Accepted Proposals')
trace2 = go.Bar(x=temp.index, y=temp_y0, name='Rejected Proposals')
data = [trace1, trace2]

layout = go.Layout(
    title="Popular school grade levels in terms of project acceptance rate and project rejection rate",
    barmode='stack',
    width=1000,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Show the target column on its own as a one-column DataFrame.
train[['project_is_approved']]

In [None]:
# Per-state summary: approved projects, total projects, distinct teachers,
# approval percentage and projects per teacher.
# Columns are selected with a list ([[...]]); selecting several columns with a
# bare tuple of names is deprecated in pandas and raises in pandas 2.x.
state_vals = (
    train.groupby(['school_state'])[['project_is_approved', 'teacher_id']]
    .agg({'project_is_approved': ['sum', 'count'], 'teacher_id': ['nunique']})
    .reset_index()
)
# Flatten the two-level column index produced by the dict-of-lists aggregation.
state_vals.columns = ['state', 'approved', 'total', 'teacher_count']
state_vals['approval_perc'] = (state_vals.approved * 100) / state_vals.total
state_vals['proj_per_teacher'] = (state_vals.total) / state_vals.teacher_count
state_vals = state_vals.round(2)
state_vals.sort_values('total', ascending=False).head()

In [None]:
# Stacked bar chart: accepted vs. rejected proposal counts for the five most
# frequent project subject categories.
temp = train["project_subject_categories"].value_counts().head(5)
outcome = train["project_is_approved"]
category = train["project_subject_categories"]

temp_y1 = [np.sum(outcome[category == val] == 1) for val in temp.index]
temp_y0 = [np.sum(outcome[category == val] == 0) for val in temp.index]

trace1 = go.Bar(x=temp.index, y=temp_y1, name='Accepted Proposals')
trace2 = go.Bar(x=temp.index, y=temp_y0, name='Rejected Proposals')
data = [trace1, trace2]

layout = go.Layout(
    title="Popular category of the project in terms of project acceptance rate and project rejection rate",
    barmode='stack',
    width=1000,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Stacked bar chart: accepted vs. rejected proposal counts for the five most
# frequent project subject sub-categories.
temp = train["project_subject_subcategories"].value_counts().head(5)
temp_y0 = []
temp_y1 = []
for val in temp.index:
    temp_y1.append(np.sum(train["project_is_approved"][train["project_subject_subcategories"]==val] == 1))
    temp_y0.append(np.sum(train["project_is_approved"][train["project_subject_subcategories"]==val] == 0))
trace1 = go.Bar(
    x = temp.index,
    y = temp_y1,
    name='Accepted Proposals'
)
trace2 = go.Bar(
    x = temp.index,
    y = temp_y0,
    name='Rejected Proposals'
)

data = [trace1, trace2]
layout = go.Layout(
    # Title fixed: it was copied verbatim from the category chart and did not
    # say that this plot is about sub-categories.
    title = "Popular sub-category of the project in terms of project acceptance rate and project rejection rate",
    barmode='stack',
    width = 1000
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Stacked bar chart: accepted vs. rejected proposal counts for the twenty most
# frequent project titles.
temp = train["project_title"].value_counts().head(20)
outcome = train["project_is_approved"]
title_col = train["project_title"]

temp_y1 = [np.sum(outcome[title_col == val] == 1) for val in temp.index]
temp_y0 = [np.sum(outcome[title_col == val] == 0) for val in temp.index]

trace1 = go.Bar(x=temp.index, y=temp_y1, name='Accepted Proposals')
trace2 = go.Bar(x=temp.index, y=temp_y0, name='Rejected Proposals')
data = [trace1, trace2]

layout = go.Layout(
    title="Popular project titles in terms of project acceptance rate and project rejection rate",
    barmode='stack',
    width=1000,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Re-read the raw train/test files and left-join the resources table on "id".
# NOTE(review): if resources holds several rows per id, this join repeats each
# project row once per resource row — confirm before counting rows downstream.
essay_columns_as_object = {"project_essay_3": object, "project_essay_4": object}

train_data = pd.read_csv("../input/train.csv", dtype=essay_columns_as_object).merge(
    resources, on="id", how='left'
)
test_data = pd.read_csv("../input/test.csv", dtype=essay_columns_as_object).merge(
    resources, on="id", how='left'
)

In [None]:
# Approval statistics for the 20 most frequent project titles.
# train_data is the result of a left join with resources on "id", so a project
# may occupy several rows; keep one row per project id so that the counts below
# really are numbers of projects (a no-op if ids are already unique).
projects_once = train_data.drop_duplicates(subset='id')
temp = projects_once.groupby('project_title')['project_is_approved'].agg(['sum', 'count'])
temp['approval_rate'] = (temp['sum']*100)/temp['count']
temp.columns = ["# of projects approved", '# of total projects', 'Approval rate']
temp = temp.sort_values(by='# of total projects', ascending=False)
temp = temp.iloc[0:20]
temp

In [None]:
import re
from nltk.corpus import stopwords

# Punctuation that is replaced by a single space. Written as a raw string so
# that "\[", "\]" and "\|" reach the regex engine unchanged instead of being
# parsed as (invalid) Python string escapes, which newer Pythons warn about.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than digits, lowercase a-z, space, '#', '+' and '_' is deleted.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# English stopword list from NLTK, as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        Normalise a piece of text for keyword counting.

        text: a string

        return: the lowercased text with REPLACE_BY_SPACE_RE matches turned
        into spaces, BAD_SYMBOLS_RE matches removed, stopwords dropped, and
        the remaining tokens joined by single spaces
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    cleaned = BAD_SYMBOLS_RE.sub('', spaced)
    # split() already discards all surrounding whitespace, so tokens are clean
    kept_tokens = [token for token in cleaned.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

In [None]:
# Word cloud of the most frequent keywords in project_essay_1.
from wordcloud import WordCloud

# Collapse all whitespace to single spaces *before* text_prepare: its
# bad-symbol regex deletes newlines/tabs outright, which would otherwise glue
# neighbouring words together.
essay_1_clean = (
    train['project_essay_1']
    .apply(lambda essay: " ".join(word.lower() for word in essay.split()))
    .map(text_prepare)
)
train['project_essay_1'] = essay_1_clean

wordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(' '.join(essay_1_clean.values))
plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.title("Top Keywords in project_essay_1", fontsize=35)
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the most frequent keywords in project_essay_2.
# Whitespace is collapsed first because text_prepare's bad-symbol regex
# deletes newlines/tabs instead of turning them into spaces.
essay_2_clean = (
    train['project_essay_2']
    .apply(lambda essay: " ".join(word.lower() for word in essay.split()))
    .map(text_prepare)
)
train['project_essay_2'] = essay_2_clean

wordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(' '.join(essay_2_clean.values))
plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.title("Top Keywords in project_essay_2", fontsize=35)
plt.axis("off")
plt.show()

In [None]:
# Word cloud of keywords in the titles of approved projects.
from wordcloud import WordCloud

# Build the cleaned titles first, then rebind `approvals` via assign():
# `approvals` was created as a boolean-mask slice of `train`, and assigning a
# column directly into such a slice raises SettingWithCopyWarning.
approved_titles = (
    approvals['project_title']
    .apply(lambda x: " ".join(x.lower() for x in x.split()))
    .map(text_prepare)
)
approvals = approvals.assign(project_title=approved_titles)

wordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(' '.join(approvals['project_title'].values))
plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
# Title names the subset so this plot can be told apart from the rejected one.
plt.title("Top Keywords in approved project_title", fontsize=35)
plt.axis("off")
plt.show()

In [None]:
# Word cloud of keywords in the titles of rejected projects.
# Build the cleaned titles first, then rebind `rejects` via assign():
# `rejects` was created as a boolean-mask slice of `train`, and assigning a
# column directly into such a slice raises SettingWithCopyWarning.
rejected_titles = (
    rejects['project_title']
    .apply(lambda x: " ".join(x.lower() for x in x.split()))
    .map(text_prepare)
)
rejects = rejects.assign(project_title=rejected_titles)

wordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(' '.join(rejects['project_title'].values))
plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
# Title names the subset so this plot can be told apart from the approved one.
plt.title("Top Keywords in rejected project_title", fontsize=35)
plt.axis("off")
plt.show()

In [None]:
# Show the first row of the resource-merged training table.
train_data.head(1)

In [None]:
# Split the resource-merged training table by outcome. .copy() yields
# independent frames so that later column assignments on them do not target a
# slice of train_data (SettingWithCopyWarning).
approvals_data = train_data[train_data.project_is_approved == 1].copy()
rejects_data = train_data[train_data.project_is_approved == 0].copy()

In [None]:
# Word cloud of keywords in the resource summaries of approved projects.
from wordcloud import WordCloud

# Clean the text into a new Series and rebind the frame with assign() rather
# than assigning into approvals_data in place: it was created by boolean-mask
# slicing, and in-place column assignment raises SettingWithCopyWarning.
approved_summaries = (
    approvals_data['project_resource_summary']
    .apply(lambda x: " ".join(x.lower() for x in x.split()))
    .map(text_prepare)
)
approvals_data = approvals_data.assign(project_resource_summary=approved_summaries)

wordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(' '.join(approvals_data['project_resource_summary'].values))
plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.title("Top Keywords in approved project_resource_summary", fontsize=35)
plt.axis("off")
plt.show()

In [None]:
# Word cloud of keywords in the resource summaries of rejected projects.
from wordcloud import WordCloud

# Clean the text into a new Series and rebind the frame with assign() rather
# than assigning into rejects_data in place: it was created by boolean-mask
# slicing, and in-place column assignment raises SettingWithCopyWarning.
rejected_summaries = (
    rejects_data['project_resource_summary']
    .apply(lambda x: " ".join(x.lower() for x in x.split()))
    .map(text_prepare)
)
rejects_data = rejects_data.assign(project_resource_summary=rejected_summaries)

wordcloud = WordCloud(max_font_size=50, width=600, height=300).generate(' '.join(rejects_data['project_resource_summary'].values))
plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.title("Top Keywords in Rejects project_resource_summary", fontsize=35)
plt.axis("off")
plt.show()

In [None]:
# Parse the submission timestamp and derive calendar features from it.
submitted_at = pd.to_datetime(train["project_submitted_datetime"])
train["project_submitted_datetime"] = submitted_at

# Each new column <part>_created holds the matching .dt accessor attribute.
for part in ("month", "weekday", "date", "hour"):
    train[part + "_created"] = getattr(submitted_at.dt, part)

In [None]:
# Stacked bar chart: accepted vs. rejected proposal counts per submission month.
temp = train["month_created"].value_counts()
outcome = train["project_is_approved"]
month = train["month_created"]

temp_y1 = [np.sum(outcome[month == val] == 1) for val in temp.index]
temp_y0 = [np.sum(outcome[month == val] == 0) for val in temp.index]

trace1 = go.Bar(x=temp.index, y=temp_y1, name='Accepted Proposals')
trace2 = go.Bar(x=temp.index, y=temp_y0, name='Rejected Proposals')
data = [trace1, trace2]

layout = go.Layout(
    title="Project Proposal Submission Month Distribution",
    barmode='stack',
    width=1000,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Stacked bar chart: accepted vs. rejected proposal counts per submission weekday.
# pandas' .dt.weekday numbers days Monday=0 ... Sunday=6.
weekday_names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

# value_counts() orders by frequency, not by weekday number. Sort by the
# weekday number and translate each number to its own name; overwriting the
# index positionally with Mon..Sun (as before) mislabelled the bars and failed
# outright if any weekday was absent from the data.
temp = train["weekday_created"].value_counts().sort_index()
temp_y0 = []
temp_y1 = []
for val in temp.index:
    temp_y1.append(np.sum(train["project_is_approved"][train["weekday_created"]==val] == 1))
    temp_y0.append(np.sum(train["project_is_approved"][train["weekday_created"]==val] == 0))

temp.index = [weekday_names[int(val)] for val in temp.index]
trace1 = go.Bar(
    x = temp.index,
    y = temp_y1,
    name='Accepted Proposals'
)
trace2 = go.Bar(
    x = temp.index,
    y = temp_y0,
    name='Rejected Proposals'
)

data = [trace1, trace2]
layout = go.Layout(
    title = "Project Proposal Submission weekday Distribution",
    barmode='stack',
    width = 1000
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Stacked bar chart: accepted vs. rejected proposal counts per submission date.
temp = train["date_created"].value_counts()

# Count both outcomes with one grouped pass each instead of re-scanning the
# whole frame twice for every distinct date. The results are re-ordered to
# temp.index so the bars stay aligned with the x values used below.
approved_flags = (train["project_is_approved"] == 1).astype(int)
rejected_flags = (train["project_is_approved"] == 0).astype(int)
temp_y1 = approved_flags.groupby(train["date_created"]).sum().reindex(temp.index).tolist()
temp_y0 = rejected_flags.groupby(train["date_created"]).sum().reindex(temp.index).tolist()

trace1 = go.Bar(
    x = temp.index,
    y = temp_y1,
    name='Accepted Proposals'
)
trace2 = go.Bar(
    x = temp.index,
    y = temp_y0,
    name='Rejected Proposals'
)

data = [trace1, trace2]
layout = go.Layout(
    title = "Project Proposal Submission date Distribution",
    barmode='stack',
    width = 1000
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Stacked bar chart: accepted vs. rejected proposal counts per submission hour.
temp = train["hour_created"].value_counts()
outcome = train["project_is_approved"]
hour = train["hour_created"]

temp_y1 = [np.sum(outcome[hour == val] == 1) for val in temp.index]
temp_y0 = [np.sum(outcome[hour == val] == 0) for val in temp.index]

trace1 = go.Bar(x=temp.index, y=temp_y1, name='Accepted Proposals')
trace2 = go.Bar(x=temp.index, y=temp_y0, name='Rejected Proposals')
data = [trace1, trace2]

layout = go.Layout(
    title="Project Proposal Submission Hour Distribution",
    barmode='stack',
    width=1000,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)