# github_readme_nlp

In [1]:
# Setup environment
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import os
import time
import pandas as pd
import numpy as np
import unicodedata
import re
import json
import os
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide matplotlib defaults, grouped by rc section and applied in one pass.
_RC_DEFAULTS = {
    "patch": dict(edgecolor="black", force_edgecolor=True),
    "axes": dict(grid=True),
    "grid": dict(linestyle=":", linewidth=0.8, alpha=0.7),
    "axes.spines": dict(right=False, top=False),
    "figure": dict(figsize=(11, 10)),
    "font": dict(size=12.0),
    "hist": dict(bins=25),
}
for _rc_group, _rc_options in _RC_DEFAULTS.items():
    plt.rc(_rc_group, **_rc_options)

import warnings
# Installs a blanket 'ignore' filter: no warning of any category is shown for the
# rest of the session, including pandas and matplotlib deprecation warnings.
warnings.filterwarnings('ignore')

import acquire
import prepare

## Acquire/Prepare

**Task** show steps to acquire and prepare

- original, cleaned, stem, lemmatize notes

In [2]:
# Build the working DataFrame through the project-local `prepare` module.
# NOTE(review): later cells read the `language`, `title` and `lemmatized` columns —
# confirm prep() always returns them.
df = prepare.prep()

In [3]:
# Preview the first rows of the prepared frame to check its columns.
df.head()

## Prep Summary

- Create data frame with cleaned data and language listed.

In [None]:
# Keep only the columns used for exploration. .copy() makes this an independent
# frame, so later column assignments do not write into a slice of `df`.
lemmas_df = df[['language', 'title', 'lemmatized']].copy()

## Explore & Define Features

### Category Distribution

In [None]:
# Drop rows with any missing value. Rebinding instead of mutating in place avoids
# modifying a slice of `df` and keeps the cell safe to re-run.
lemmas_df = lemmas_df.dropna()

In [None]:
# (rows, columns) of the exploration frame after rows with missing values were dropped.
lemmas_df.shape

In [None]:
# Relative share of READMEs per language.
lemmas_df["language"].value_counts(normalize=True)

In [None]:
# Count READMEs per language once and reuse it for both the chart and the table.
language_counts = df.language.value_counts()

pie_ax = language_counts.plot.pie(
    colors=['pink', 'lightblue', 'green', 'orange'], autopct='%.0f%%')
pie_ax.set_title("Language Distribution")
pie_ax.set_ylabel("")
# Label with the number of rows actually represented in the pie
# (value_counts skips rows whose language is missing).
pie_ax.set_xlabel('n = %d' % language_counts.sum())

# Absolute and relative counts side by side. Naming the columns through `keys`
# replaces set_axis(..., inplace=False); the `inplace` argument was removed in pandas 2.0.
pd.concat(
    [language_counts, df.language.value_counts(normalize=True)],
    axis=1,
    keys=["n", "percent"],
)

**Note:** Join all READMEs into a single string and find the most frequent words overall.

In [None]:
# Pool every lemmatized README into one token list, then show the ten most frequent tokens.
readme_tokens = " ".join(lemmas_df.lemmatized).split()
pd.Series(readme_tokens).value_counts().head(10)

**Note:** words within each language

In [None]:
# Whitespace-split tokens of every lemmatized README, pooled together.
all_words = ' '.join(lemmas_df.lemmatized).split()

# The same token list, restricted to the READMEs of each language in turn.
python_words, javascript_words, php_words, shell_words = (
    ' '.join(lemmas_df.loc[lemmas_df.language == language_name, 'lemmatized']).split()
    for language_name in ('Python', 'JavaScript', 'PHP', 'Shell')
)

- Once we have a list of words, we can transform it into a pandas Series, which we can then use to show us how often each of the words occurs.

In [None]:
# Occurrence count of each word, overall and per language (most frequent first).
(
    all_freq,
    python_words_freq,
    javascript_words_freq,
    php_words_freq,
    shell_words_freq,
) = (
    pd.Series(word_list).value_counts()
    for word_list in (all_words, python_words, javascript_words, php_words, shell_words)
)

### Term Frequency

Generally, 'Term Frequency' refers to how often a word appears in a document. When we move on to computing TF-IDF, this basic definition of Term Frequency applies.

However, term frequency can be calculated in a number of ways, all of which reflect how frequently a word appears in a document.

- Raw Count: This is simply the count of the number of occurrences of each word.
- Frequency: The number of times each word appears divided by the total number of words.
- Augmented Frequency: The frequency of each word divided by the maximum frequency. This can help prevent bias towards larger documents.

In [None]:
# One row per word, one column per corpus, holding integer occurrence counts.
# The columns are named through `keys` rather than by renaming positional labels
# 0..4: value_counts() results carry the name "count" on pandas >= 2.0, so the
# positional rename would silently leave five columns all called "count".
raw_count = (
    pd.concat(
        [all_freq, python_words_freq, javascript_words_freq, php_words_freq, shell_words_freq],
        axis=1,
        sort=True,
        keys=["All", "Python", "JavaScript", "PHP", "Shell"],
    )
    .fillna(0)  # a word absent from one corpus has count 0 there
    .astype(int)
)

In [None]:
# Transposed view: one row per corpus (All, Python, JavaScript, PHP, Shell), one column per word.
raw_count.T

**Task:** Find the record that contains the stray token `&#9`.

In [None]:
# Rows for the ten words with the highest overall count (ascending, largest last).
raw_count.sort_values("All").iloc[-10:]

- Most common words overall?
- Most common Python, JavaScript, PHP, Shell words?
- Are any words unique to Python, JavaScript, PHP, or Shell?

In [None]:
# Ten highest overall word counts as horizontal bars.
top_overall = raw_count.sort_values(by='All')['All'].iloc[-10:]
top_overall.plot.barh(width=.9)
plt.title('10 most common words')

In [None]:
# Ten highest word counts within the Python READMEs.
top_python = raw_count.sort_values(by='Python')['Python'].iloc[-10:]
top_python.plot.barh(width=1, color='Orange')
plt.title('10 most common Python words')

In [None]:
# Ten highest word counts within the JavaScript READMEs.
top_javascript = raw_count.sort_values(by='JavaScript')['JavaScript'].iloc[-10:]
top_javascript.plot.barh(width=1, color='lightblue')
plt.title('What are the most common JavaScript words')

In [None]:
# Ten highest word counts within the PHP READMEs.
top_php = raw_count.sort_values(by='PHP')['PHP'].iloc[-10:]
top_php.plot.barh(width=1, color='pink')
plt.title('What are the most common PHP words')

In [None]:
# Ten highest word counts within the Shell READMEs.
top_shell = raw_count.sort_values(by='Shell')['Shell'].iloc[-10:]
top_shell.plot.barh(width=1, color='green')
plt.title('What are the most common Shell words')

In [None]:
# First rows of the per-corpus count table (index is sorted because concat used sort=True).
raw_count.head()

In [None]:
# Term-frequency table for the pooled corpus:
#   raw_count           - occurrences of each word
#   frequency           - raw_count divided by the total number of words
#   augmented_frequency - frequency divided by the largest frequency
tf_df = (
    raw_count[['All']]
    .rename(columns={'All': 'raw_count'})
    .assign(frequency=lambda counts: counts.raw_count / counts.raw_count.sum())
    .assign(augmented_frequency=lambda freqs: freqs.frequency / freqs.frequency.max())
)

# Show one randomly chosen row as a spot check.
tf_df.sample()


### Ngrams

- Bigrams and visualizations of most frequent for all, Python, JavaScript, PHP, and Shell languages.

In [None]:
# Fifteen most frequent adjacent word pairs across all READMEs.
bigram_counts_all = pd.Series(nltk.bigrams(all_words)).value_counts().head(15)
bigram_counts_all.plot.barh(width=.95).set_title('Bigrams for All Words')

In [None]:
# Fifteen most frequent adjacent word pairs in the Python READMEs.
bigram_counts_python = pd.Series(nltk.bigrams(python_words)).value_counts().head(15)
bigram_counts_python.plot.barh(width=.95, color='orange').set_title('Bigrams for Python Words')

In [None]:
# Fifteen most frequent adjacent word pairs in the JavaScript READMEs.
bigram_counts_javascript = pd.Series(nltk.bigrams(javascript_words)).value_counts().head(15)
bigram_counts_javascript.plot.barh(width=.95, color='lightblue').set_title('Bigrams for JavaScript Words')

In [None]:
# Fifteen most frequent adjacent word pairs in the PHP READMEs.
bigram_counts_php = pd.Series(nltk.bigrams(php_words)).value_counts().head(15)
bigram_counts_php.plot.barh(width=.95, color='pink').set_title('Bigrams for PHP Words')

In [None]:
# Fifteen most frequent adjacent word pairs in the Shell READMEs.
bigram_counts_shell = pd.Series(nltk.bigrams(shell_words)).value_counts().head(15)
bigram_counts_shell.plot.barh(width=.95, color='green').set_title('Bigrams for Shell Words')

- Trigrams and visualizations of most frequent for all, Python, JavaScript, PHP, and Shell languages.

In [None]:
# Fifteen most frequent three-word sequences across all READMEs.
trigram_counts_all = pd.Series(nltk.ngrams(all_words, 3)).value_counts().head(15)
trigram_counts_all.plot.barh(width=.95).set_title('Trigrams for All Words')

In [None]:
# Fifteen most frequent three-word sequences in the Python READMEs, drawn in the
# orange used for Python in the other per-language bar charts.
trigram_counts_python = pd.Series(nltk.ngrams(python_words, 3)).value_counts().head(15)
trigram_counts_python.plot.barh(width=.95, color='orange').set_title('Trigrams for Python Words')

In [None]:
# Fifteen most frequent three-word sequences in the JavaScript READMEs, drawn in the
# light blue used for JavaScript in the other per-language bar charts.
trigram_counts_javascript = pd.Series(nltk.ngrams(javascript_words, 3)).value_counts().head(15)
trigram_counts_javascript.plot.barh(width=.95, color='lightblue').set_title('Trigrams for JavaScript Words')

In [None]:
# Fifteen most frequent three-word sequences in the PHP READMEs, drawn in the
# pink used for PHP in the other per-language bar charts.
trigram_counts_php = pd.Series(nltk.ngrams(php_words, 3)).value_counts().head(15)
trigram_counts_php.plot.barh(width=.95, color='pink').set_title('Trigrams for PHP Words')

In [None]:
# Fifteen most frequent three-word sequences in the Shell READMEs, drawn in the
# green used for Shell in the other per-language bar charts.
trigram_counts_shell = pd.Series(nltk.ngrams(shell_words, 3)).value_counts().head(15)
trigram_counts_shell.plot.barh(width=.95, color='green').set_title('Trigrams for Shell Words')

### Document Length

In [None]:
# READMEs longer than this many characters are excluded from the length analysis.
MAX_DOC_LENGTH = 140000

# Character count of each lemmatized README. assign() and boolean filtering build
# new frames instead of mutating in place, so the cell gives the same result when
# re-run and never writes into a slice of another frame.
lemmas_df = lemmas_df.assign(length=lemmas_df.lemmatized.apply(len))
lemmas_df = lemmas_df[lemmas_df.length <= MAX_DOC_LENGTH].copy()

In [None]:
# Histogram of README length in characters, all languages together.
length_ax = lemmas_df["length"].plot(kind="hist")
length_ax.set_title('Document Length')

In [None]:
# One length histogram per language on a shared 2x2 grid, so the panels are
# directly comparable (shared x and y axes).
ar = lemmas_df.hist("length", by="language", sharex=True, sharey=True, layout=(2, 2), bins=15, figsize=(12, 9))
plt.suptitle('Distribution of Length of Characters for All Languages')

# Label every panel; the stray attribute access `ax.set` that ended the original
# loop did nothing and has been removed.
for ax in ar.flatten():
    ax.set_xlabel('Document Length in Characters')
    ax.set_ylabel('Number of Documents')

In [None]:
fig = plt.figure(figsize=(12, 7))

# Box plot of README length in characters, one box per language.
ax = sns.boxplot(data=lemmas_df, y="length", x="language")
ax.set_title('Document Length in Number of Characters')
ax.margins(.005)
# language is plotted on x and length on y; the original labels were swapped.
ax.set_xlabel('Language')
ax.set_ylabel('Length in Characters')

### Number of Words

In [None]:
# Number of word tokens (runs of \w characters) in each lemmatized README.
lemmas_df["n_words"] = lemmas_df["lemmatized"].str.count(r"\w+")

In [None]:
# Summary statistics of the word count, one row per language.
lemmas_df.groupby("language")["n_words"].describe()

In [None]:
# Histogram of README length in words, all languages together.
n_words_ax = lemmas_df["n_words"].plot(kind="hist", bins=25)
n_words_ax.set_title('Document Length in Number of Words')

In [None]:
# One word-count histogram per language on a shared 2x2 grid.
ar = lemmas_df.hist(
    "n_words",
    by="language",
    layout=(2, 2),
    bins=15,
    figsize=(12, 9),
    sharex=True,
    sharey=True,
)
plt.suptitle("Distribution of Number of Words for All Languages")

# Give every panel the same axis labels.
for ax in ar.ravel():
    ax.set(xlabel='Document Length in Number of Words', ylabel='Number of Documents')

### Word Cloud

- WordCloud with all words

In [None]:
# Single string holding every word, as input for the cloud.
corpus = ' '.join(all_words)

# Seeded generator, so the layout is the same on every run.
cloud_maker = WordCloud(width=1500, height=1000, background_color="white", random_state=123)
img = cloud_maker.generate(corpus)

plt.figure(figsize=(12, 12))
plt.imshow(img)
plt.axis("off")

- WordCloud with all words by languages

In [None]:
# One space-joined string per language, built from that language's word list.
python_corpus, javascript_corpus, php_corpus, shell_corpus = (
    ' '.join(word_list)
    for word_list in (python_words, javascript_words, php_words, shell_words)
)

In [None]:
# One cloud per language, generated from the corpora joined in the previous cell
# rather than re-joining the word lists. random_state pins the layout so a re-run
# reproduces the same figure, as the all-words cloud already does.
python_cloud, javascript_cloud, php_cloud, shell_cloud = (
    WordCloud(background_color="white", height=600, width=800, random_state=123).generate(text)
    for text in (python_corpus, javascript_corpus, php_corpus, shell_corpus)
)

# A regular 2x2 grid replaces the hand-placed axes, whose rectangles reached past
# the figure edge (x and y extents above 1.0). Panel positions are unchanged:
# JavaScript top-left, PHP top-right, Python bottom-left, Shell bottom-right.
fig, grid = plt.subplots(2, 2, figsize=(10, 8))
axs = [grid[1, 0], grid[0, 0], grid[0, 1], grid[1, 1]]
clouds = [python_cloud, javascript_cloud, php_cloud, shell_cloud]
labels = ["Python", "JavaScript", "PHP", "Shell"]

for ax, cloud, label in zip(axs, clouds, labels):
    ax.imshow(cloud)
    ax.set_title(label)
    ax.axis("off")

fig.tight_layout()

### Word Cloud with Bigrams

In [None]:
# Fifteen most frequent adjacent word pairs across all READMEs.
all_bigrams = pd.Series(nltk.bigrams(all_words)).value_counts().head(15)

# The cloud needs string keys, so each (w1, w2) tuple becomes "w1 w2".
all_data = {" ".join(pair): count for pair, count in all_bigrams.to_dict().items()}

# random_state fixes the otherwise random layout, matching the seeded all-words cloud.
img = WordCloud(background_color="white", height=600, width=800, random_state=123).generate_from_frequencies(all_data)
plt.figure(figsize=(9, 6))
plt.imshow(img)
plt.axis("off")
plt.title('Top 15 Bigrams Word Cloud with All Words')

In [None]:
# Fifteen most frequent adjacent word pairs in the Python READMEs.
python_bigrams = pd.Series(nltk.bigrams(python_words)).value_counts().head(15)

# The cloud needs string keys, so each (w1, w2) tuple becomes "w1 w2".
python_data = {" ".join(pair): count for pair, count in python_bigrams.to_dict().items()}

# random_state fixes the otherwise random layout, matching the seeded all-words cloud.
img = WordCloud(background_color="white", height=600, width=800, random_state=123).generate_from_frequencies(python_data)
plt.figure(figsize=(9, 6))
plt.imshow(img)
plt.axis("off")
plt.title('Top 15 Bigrams Word Cloud with Python Words')

In [None]:
# Fifteen most frequent adjacent word pairs in the JavaScript READMEs.
javascript_bigrams = pd.Series(nltk.bigrams(javascript_words)).value_counts().head(15)

# The cloud needs string keys, so each (w1, w2) tuple becomes "w1 w2".
javascript_data = {" ".join(pair): count for pair, count in javascript_bigrams.to_dict().items()}

# random_state fixes the otherwise random layout, matching the seeded all-words cloud.
img = WordCloud(background_color="white", height=600, width=800, random_state=123).generate_from_frequencies(javascript_data)
plt.figure(figsize=(9, 6))
plt.imshow(img)
plt.axis("off")
plt.title('Top 15 Bigrams Word Cloud with JavaScript Words')

In [None]:
# Fifteen most frequent adjacent word pairs in the PHP READMEs.
php_bigrams = pd.Series(nltk.bigrams(php_words)).value_counts().head(15)

# The cloud needs string keys, so each (w1, w2) tuple becomes "w1 w2".
php_data = {" ".join(pair): count for pair, count in php_bigrams.to_dict().items()}

# random_state fixes the otherwise random layout, matching the seeded all-words cloud.
img = WordCloud(background_color="white", height=600, width=800, random_state=123).generate_from_frequencies(php_data)
plt.figure(figsize=(9, 6))
plt.imshow(img)
plt.axis("off")
plt.title('Top 15 Bigrams Word Cloud with PHP Words')

In [None]:
# Fifteen most frequent adjacent word pairs in the Shell READMEs.
shell_bigrams = pd.Series(nltk.bigrams(shell_words)).value_counts().head(15)

# The cloud needs string keys, so each (w1, w2) tuple becomes "w1 w2".
shell_data = {" ".join(pair): count for pair, count in shell_bigrams.to_dict().items()}

# random_state fixes the otherwise random layout, matching the seeded all-words cloud.
img = WordCloud(background_color="white", height=600, width=800, random_state=123).generate_from_frequencies(shell_data)
plt.figure(figsize=(9, 6))
plt.imshow(img)
plt.axis("off")
plt.title('Top 15 Bigrams Word Cloud with Shell Words')

### Word Cloud with Trigrams

In [None]:
# Fifteen most frequent three-word sequences across all READMEs.
all_trigrams = pd.Series(nltk.ngrams(all_words, 3)).value_counts().head(15)

# The cloud needs string keys, so each (w1, w2, w3) tuple becomes "w1 w2 w3".
data = {" ".join(triple): count for triple, count in all_trigrams.to_dict().items()}

# Alias kept from the original cell; nothing in the visible cells reads it.
frequencies = data

# random_state fixes the otherwise random layout, matching the seeded all-words cloud.
img = WordCloud(background_color="white", height=500, width=800, random_state=123).generate_from_frequencies(data)
plt.figure(figsize=(9, 6))
plt.imshow(img)
plt.axis("off")
plt.title('Top 15 Trigrams with All Words')

In [None]:
# Fifteen most frequent three-word sequences in the Python READMEs.
python_trigrams = pd.Series(nltk.ngrams(python_words, 3)).value_counts().head(15)

# The cloud needs string keys, so each (w1, w2, w3) tuple becomes "w1 w2 w3".
data = {" ".join(triple): count for triple, count in python_trigrams.to_dict().items()}

# Alias kept from the original cell; nothing in the visible cells reads it.
frequencies = data

# random_state fixes the otherwise random layout, matching the seeded all-words cloud.
img = WordCloud(background_color="white", height=500, width=800, random_state=123).generate_from_frequencies(data)
plt.figure(figsize=(9, 6))
plt.imshow(img)
plt.axis("off")
plt.title('Top 15 Trigrams with Python Words')

In [None]:
# Fifteen most frequent three-word sequences in the JavaScript READMEs.
javascript_trigrams = pd.Series(nltk.ngrams(javascript_words, 3)).value_counts().head(15)

# The cloud needs string keys, so each (w1, w2, w3) tuple becomes "w1 w2 w3".
data = {" ".join(triple): count for triple, count in javascript_trigrams.to_dict().items()}

# Alias kept from the original cell; nothing in the visible cells reads it.
frequencies = data

# random_state fixes the otherwise random layout, matching the seeded all-words cloud.
img = WordCloud(background_color="white", height=500, width=800, random_state=123).generate_from_frequencies(data)
plt.figure(figsize=(9, 6))
plt.imshow(img)
plt.axis("off")
plt.title('Top 15 Trigrams with JavaScript Words')

In [None]:
# Fifteen most frequent three-word sequences in the PHP READMEs.
php_trigrams = pd.Series(nltk.ngrams(php_words, 3)).value_counts().head(15)

# The cloud needs string keys, so each (w1, w2, w3) tuple becomes "w1 w2 w3".
data = {" ".join(triple): count for triple, count in php_trigrams.to_dict().items()}

# Alias kept from the original cell; nothing in the visible cells reads it.
frequencies = data

# random_state fixes the otherwise random layout, matching the seeded all-words cloud.
img = WordCloud(background_color="white", height=500, width=800, random_state=123).generate_from_frequencies(data)
plt.figure(figsize=(9, 6))
plt.imshow(img)
plt.axis("off")
plt.title('Top 15 Trigrams with PHP Words')

In [None]:
# Fifteen most frequent three-word sequences in the Shell READMEs.
shell_trigrams = pd.Series(nltk.ngrams(shell_words, 3)).value_counts().head(15)

# The cloud needs string keys, so each (w1, w2, w3) tuple becomes "w1 w2 w3".
data = {" ".join(triple): count for triple, count in shell_trigrams.to_dict().items()}

# Alias kept from the original cell; nothing in the visible cells reads it.
frequencies = data

# random_state fixes the otherwise random layout, matching the seeded all-words cloud.
img = WordCloud(background_color="white", height=500, width=800, random_state=123).generate_from_frequencies(data)
plt.figure(figsize=(9, 6))
plt.imshow(img)
plt.axis("off")
plt.title('Top 15 Trigrams with Shell Words')