In [1]:
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests
import pandas as pd
from env import github_token, github_username

import prepare
import acquire
#import explore
#import model

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filter="ignore"

from bs4 import BeautifulSoup
from mergedeep import merge

## Using  [Acquire.py](https://github.com/bert-jason-ray/nlp-group-project/blob/main/acquire.py) file to bring in new dataframe.

In [2]:
# define the acquired file as new data frame
df = acquire.get_github_data()
df

Unnamed: 0,repo,language,readme_contents
0,jagrosh/MusicBot,Java,"<img align=""right"" src=""https://i.imgur.com/zr..."
1,Just-Some-Bots/MusicBot,Python,# MusicBot\n\n[![GitHub stars](https://img.shi...
2,SudhanPlayz/Discord-MusicBot,JavaScript,"<h1 align=""center""><img src=""./assets/logo.gif..."
3,IVETRI/SongPlayRoBot,Python,# Check Our New Bot Repo & Video :\n\n[Video](...
4,Splamy/TS3AudioBot,C#,# TS3AudioBot\n\nThis is a open-source TeamSpe...
...,...,...,...
195,Davidremo02/LazyMusicbot,,"<h2 align=""centre"">Telegram Group Music Player..."
196,MrRizoel/RiZoeLXMusic,Python,"<h2 align=""centre"">ℝ𝚒ℤ𝚘𝚎𝕃𝕏𝕄𝚞𝚜𝚒𝚌 🎵</h2>\n\n### ..."
197,noirscape/MusicBot-2,Python,# Dynamic cog bot template\n\nThis is a bot te...
198,bhkvlldu/MusicBot,Python,# ⌯ Source zoro ˹♻️˼⁩\n\n• I am a developer in...


## Using  [Prepare.py](https://github.com/bert-jason-ray/nlp-group-project/blob/main/prepare.py) to summon Clean, Stemmed, and Lemmatized data.

In [17]:
df = prepare.prep_github_data(df,column = 'readme_contents', extra_words=[], exclude_words=[])
df

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
0,jagrosh/MusicBot,Java,"<img align=""right"" src=""https://i.imgur.com/zr...",img alignright srchttpsiimgurcomzre80hypng hei...,img alignright srchttpsiimgurcomzre80hypng hei...,img alignright srchttpsiimgurcomzre80hypng hei...
1,Just-Some-Bots/MusicBot,Python,# MusicBot\n\n[![GitHub stars](https://img.shi...,musicbot github starshttpsimgshieldsiogithubst...,musicbot github starshttpsimgshieldsiogithubst...,musicbot github starshttpsimgshieldsiogithubst...
2,SudhanPlayz/Discord-MusicBot,JavaScript,"<h1 align=""center""><img src=""./assets/logo.gif...",h1 aligncenterimg srcassetslogogif width30px d...,h1 aligncenterimg srcassetslogogif width30px d...,h1 aligncenterimg srcassetslogogif width30px d...
3,IVETRI/SongPlayRoBot,Python,# Check Our New Bot Repo & Video :\n\n[Video](...,check new bot repo video videohttpsyoutube3pn0...,check new bot repo video videohttpsyoutube3pn0...,check new bot repo video videohttpsyoutube3pn0...
4,Splamy/TS3AudioBot,C#,# TS3AudioBot\n\nThis is a open-source TeamSpe...,ts3audiobot opensource teamspeak3 bot playing ...,ts3audiobot opensourc teamspeak3 bot play musi...,ts3audiobot opensource teamspeak3 bot playing ...
...,...,...,...,...,...,...
192,Micium-Development/Bounce,JavaScript,### 📥 Micium-Development Is recruiting develop...,miciumdevelopment recruiting developers want j...,miciumdevelop recruit develop want join dev te...,miciumdevelopment recruiting developer want jo...
193,philliphqs/hqs.bot,Python,"<br />\n<p align=""center"">\n <a href=""https:/...",br p aligncenter hrefhttpsgithubcomphilliphqsh...,br p aligncent hrefhttpsgithubcomphilliphqshqs...,br p aligncenter hrefhttpsgithubcomphilliphqsh...
196,MrRizoel/RiZoeLXMusic,Python,"<h2 align=""centre"">ℝ𝚒ℤ𝚘𝚎𝕃𝕏𝕄𝚞𝚜𝚒𝚌 🎵</h2>\n\n### ...",h2 aligncentrerizoelxmusic h2 bot play music t...,h2 aligncentrerizoelxmus h2 bot play music tel...,h2 aligncentrerizoelxmusic h2 bot play music t...
197,noirscape/MusicBot-2,Python,# Dynamic cog bot template\n\nThis is a bot te...,dynamic cog bot template bot template discordp...,dynam cog bot templat bot templat discordpi fr...,dynamic cog bot template bot template discordp...


## Looking at the amounts of times a specific language is used.

In [25]:
df = df.drop(columns = ['readme_contents', 'stemmed','clean'])

In [26]:
df = df.dropna()

In [27]:
df

Unnamed: 0,repo,language,lemmatized
0,jagrosh/MusicBot,Java,img alignright srchttpsiimgurcomzre80hypng hei...
1,Just-Some-Bots/MusicBot,Python,musicbot github starshttpsimgshieldsiogithubst...
2,SudhanPlayz/Discord-MusicBot,JavaScript,h1 aligncenterimg srcassetslogogif width30px d...
3,IVETRI/SongPlayRoBot,Python,check new bot repo video videohttpsyoutube3pn0...
4,Splamy/TS3AudioBot,C#,ts3audiobot opensource teamspeak3 bot playing ...
...,...,...,...
192,Micium-Development/Bounce,JavaScript,miciumdevelopment recruiting developer want jo...
193,philliphqs/hqs.bot,Python,br p aligncenter hrefhttpsgithubcomphilliphqsh...
196,MrRizoel/RiZoeLXMusic,Python,h2 aligncentrerizoelxmusic h2 bot play music t...
197,noirscape/MusicBot-2,Python,dynamic cog bot template bot template discordp...


# `Splitting Data`

In [28]:
def nlp_X_train_split(X_data, y_data):
    '''
    This function is designed for splitting data during an NLP pipeline
    It takes in the X_data (already transformed by your Vectorizer)
    y_data (target)
    And performs a train validate test X/y split (FOR MODELING NOT EXPLORATION)
    This is a one shot for doing train validate test and x/y split in one go
    
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    
    Returns 6 dfs: X_train, y_train, X_validate, y_validate, X_test, y_test
    '''
    X_train_validate, X_test, y_train_validate, y_test = train_test_split(X_data, y_data,
                                                                          test_size=.2, random_state=123)
    
    X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, 
                                                                test_size=.3, 
                                                                random_state=123)
    
    return X_train, y_train, X_validate, y_validate, X_test, y_test

In [29]:
X_train, y_train, X_validate, y_validate, X_test, y_test = nlp_X_train_split(df,df.language)

In [30]:
X_train

Unnamed: 0,repo,language,lemmatized
39,BluSpring/discord.js-lavalink-musicbot,JavaScript,discordjslavalinkmusicbot lavalink music bot m...
107,iamsurojit/Discord-MusicBot,JavaScript,musicbot advanced discord lavalink music bot
118,wvffle/ts3-musicbot,Python,ts3musicbot python script creates command inte...
137,kaoru-nk/atmusicbot,JavaScript,musicbot discordmusicbot npm install cp config...
22,parasop/NEW-MUSIC-BOT,JavaScript,best music bot advance stable music bot featur...
...,...,...,...
149,Ryuukai/discord-musicbot,JavaScript,discordmusicbot musicbot discord made nodejs r...
59,MR-INVISIBLEBOY/LEGENDBOT-INVISIBLE1,HTML,h2 aligncenterb personal legendbotliinformatio...
96,TimovNiedek/MusicBot,Python,musicbot telegram chatbot recommending music b...
191,ItsClairton/Anny,Go,h1 aligncenteranny simple bot discordh1 p alig...


In [31]:
y_train

39     JavaScript
107    JavaScript
118        Python
137    JavaScript
22     JavaScript
          ...    
149    JavaScript
59           HTML
96         Python
191            Go
108    JavaScript
Name: language, Length: 100, dtype: object

In [8]:
word_counts = (pd.concat([all_freq, JavaScript_freq, Python_freq, Java_prep, ], axis=1, sort=True)
                .set_axis(['all', 'JavaScript', 'Python','Java','TypeScript','Go','Kotlin'], axis=1, inplace=False)
                .fillna(0)
                .apply(lambda s: s.astype(int)))

word_counts.head()

NameError: name 'all_freq' is not defined

In [None]:
pd.concat([word_counts[word_counts.JavaScript == 0].sort_values(by='JavaScript').tail(6),
           word_counts[word_counts.Python == 0].sort_values(by='Python').tail(6)])

In [None]:
# figure out the percentage of spam vs ham
(word_counts
 .assign(P_JavaScript=word_counts.JavaScript / word_counts['all'],
         p_Python=word_counts.Python / word_counts['all'])
 .sort_values(by='all')
 [['JavaScript', 'Python']]
 .tail(20)
 .sort_values('JavaScript')
 .plot.barh(stacked=True))

plt.title('Proportion of JavaScript vs python for the 20 most common words')

`Takeaways`**:**
 - **JavaScript & Python seem to dominate, while Java & TypeScript and all others seem to be less than 13.**