In [None]:
# set path
import os
import pandas as pd
import csv; import numpy as np
import re

# All relative 'data/...' paths below are resolved against this directory.
path = '/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/'
os.chdir(path) # change directory

# load in data

# Labelled source files. Every read skips the first two lines of the file
# (skiprows=2); nrows caps how many rows are taken from some of the files.
# NOTE(review): `error_bad_lines` was removed in pandas 2.0 (replaced by
# on_bad_lines='skip') - confirm the pandas version this is run with.
okgo = pd.read_csv('data/OKGO.csv', delimiter=";", skiprows=2, encoding='latin-1', engine='python') # read in the data
trump = pd.read_csv('data/trump.csv', delimiter=",", skiprows=2, encoding='utf-8', error_bad_lines=False, engine='python')
swift = pd.read_csv('data/TaylorSwift.csv', delimiter=",", skiprows=2, nrows=180, encoding='utf-8', engine='python')
royal = pd.read_csv('data/RoyalWedding.csv', delimiter=",", skiprows=2, nrows=61, encoding='utf-8', engine='python')
paul = pd.read_csv('data/LoganPaul.csv', delimiter=",", skiprows=2, nrows=200, encoding='utf-8', engine='python')
blogs = pd.read_csv('data/Kagel.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data
tweets = pd.read_csv('data/twitter.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data

# test data:
#trump = pd.read_csv('data/trump.csv', delimiter="@@@", skiprows=2, encoding='utf-8', error_bad_lines=False, engine='python')
# Frame the trained models later predict on (it becomes X_user). The
# multi-character "@@@" separator is why the python parsing engine is used.
df = pd.read_csv('data/data.csv', delimiter="@@@", skiprows=2, encoding='utf-8', engine='python')

# clean dataframes: drop the twitter metadata columns and any row with a missing value
tweets = tweets.drop(['Topic', 'TweetId', "TweetDate"], axis = 1).dropna()

def fix_cols(DF):
    """Return the first two columns of *DF* renamed to ``label`` and ``comment``.

    The columns are selected by position, so the caller is responsible for
    the label being column 0 and the comment text being column 1.

    The result is an independent copy. Returning the bare ``iloc`` slice made
    later column additions on the result (for example in ``nlpFunction``)
    emit ``SettingWithCopyWarning`` on pandas versions without copy-on-write.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame with at least two columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with exactly the columns ``["label", "comment"]``.
    """
    trimmed = DF.iloc[:, :2].copy()
    trimmed.columns = ["label", "comment"]
    return trimmed

# Bring every labelled source to the same two-column (label, comment) shape.
okgo = fix_cols(okgo)
trump = fix_cols(trump)
swift = fix_cols(swift)
royal = fix_cols(royal)
paul = fix_cols(paul)
# The former `data = fix_cols(data)` call was removed from this spot: `data`
# is first bound by the concat/copy below, so the call raised NameError on a
# fresh run and its result would have been overwritten anyway.
tweets = fix_cols(tweets)

# Map the textual twitter sentiment onto 1.0 / 0.0 / -1.0; anything that
# still is not numeric afterwards becomes NaN (errors='coerce').
# `tweets` is prepared here but is not part of the concat below.
tweets.label = tweets.label.replace({'positive': '1.0', 'negative':'-1.0', 'neutral': '0.0', 'irrelevant': '0.0'}, regex=True)
tweets['label'] = pd.to_numeric(tweets['label'], errors='coerce')

# Stack the video comment sets into one training frame.
# NOTE(review): `trump` is included here and is also used later as the test
# split (X_test / Y_test) - confirm that overlap is intended.
videos = pd.concat([okgo, trump, swift, royal, paul], ignore_index=True)
data = videos.copy()

# data.csv stores the comment first and the label second.
df.columns = ["comment", "label"]

#DataList = [videos, full, videos_not_royal, videos_not_okgo]
#excluded = [okgo, royal]

# clean up textual data (remove symbols)
def AsStr(DF):
    """Cast the ``comment`` column of *DF* to ``str`` in place.

    The frame is modified directly and nothing is returned.
    """
    as_text = DF["comment"].astype(str)
    DF["comment"] = as_text

# Coerce the comment column of both frames to str so the regex cleaners
# below always receive strings.
AsStr(data)
AsStr(df)

def cleanerFn(b):
    """Replace every non-letter character in ``b["comment"]`` with a space.

    The frame is modified in place and nothing is returned. Every value in
    the ``comment`` column must already be a string (see ``AsStr``).

    The whole column is rewritten in one assignment. The previous version
    looked rows up with ``b.loc[0 .. len(b) - 1]``, which only works when the
    index is exactly 0..n-1 and raises ``KeyError`` otherwise.

    Parameters
    ----------
    b : pandas.DataFrame
        Frame with a string ``comment`` column.
    """
    b["comment"] = [re.sub("[^a-zA-Z]", " ", line) for line in b["comment"]]

def cleanerFn2(b):
    """Replace every non-letter character in the second column of *b* with a space.

    Works by position: the text is read from and written back to column
    index 1, row by row. The frame is modified in place; nothing is returned.
    """
    non_letters = re.compile("[^a-zA-Z]")
    text_col = 1  # positional index of the column that is cleaned
    for pos in range(b.shape[0]):
        b.iloc[pos, text_col] = non_letters.sub(" ", b.iloc[pos, text_col])

# Strip non-letters from the comments. df has "comment" as its first column
# and is cleaned by column name; data has it as its second column and is
# cleaned by position.
cleanerFn(df)
cleanerFn2(data)

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
import sklearn # machine learning
from sklearn.model_selection import train_test_split # splitting up data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Shared helpers read as module-level globals by nlpFunction.
sw = stopwords.words('english')  # English stop-word list
ps = PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
# This vectorizer is replaced by a fresh TfidfVectorizer() before it is fitted.
tfidf = TfidfVectorizer()

def nlpFunction(a):
    """Add tokenised, stop-word-filtered, lemmatised and stemmed columns to *a*.

    Reads ``a["comment"]`` and writes five new columns onto the same frame:

    * ``com_token``    - lower-cased comment split on whitespace
    * ``com_remv``     - tokens that are not in the module-level ``sw`` list
    * ``com_lemma``    - ``lemmatizer.lemmatize`` applied to each kept token
    * ``com_stem``     - ``ps.stem`` applied to each lemma
    * ``com_stem_str`` - the stems joined with ``", "``

    Parameters
    ----------
    a : pandas.DataFrame
        Frame with a string ``comment`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object *a*, for call-site convenience.
    """
    # Build the lookup set once: testing membership against the list scanned
    # every stop word for every token of every comment.
    stop_words = set(sw)

    a['com_token'] = a['comment'].str.lower().str.split()
    a['com_remv'] = a['com_token'].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words])
    a["com_lemma"] = a['com_remv'].apply(
        lambda tokens: [lemmatizer.lemmatize(tok) for tok in tokens])  # lemmatization
    a['com_stem'] = a['com_lemma'].apply(
        lambda tokens: [ps.stem(tok) for tok in tokens])  # stemming
    a["com_stem_str"] = a["com_stem"].apply(', '.join)
    return a

# Add the token / stop-word / lemma / stem columns to each frame.
df = nlpFunction(df)
data = nlpFunction(data)
# NOTE(review): unlike data and df, trump has not been passed through AsStr
# or a cleaner function before this call - confirm that is intended.
trump = nlpFunction(trump)

# Text and labels: data is the training set, trump the test split, and df
# the frame whose labels are predicted further down.
X_train = data["com_stem_str"]
X_test = trump["com_stem_str"]
Y_train = data["label"]
Y_test = trump["label"]
X_user = df["com_stem_str"]

# The vocabulary and IDF weights are fitted on the training text only; every
# other set is transformed with that fitted vectorizer.
tfidf = TfidfVectorizer()
xtrain = tfidf.fit_transform(X_train) # transform and fit training data
xtest = tfidf.transform(X_test) # transform test data from fitted transformer
xuser = tfidf.transform(X_user)
data_trans= tfidf.transform(data["com_stem_str"]) # transform entire dataset for cross validation
df_trans = tfidf.transform(df["com_stem_str"])

'''X_train, X_test, Y_train, Y_test = train_test_split(
                                    df["com_stem_str"], df["label"],
                                    test_size=0.25,
                                    random_state=42)'''


# running models
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm # support vector machine
from sklearn import metrics # for accuracy/ precision
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier # Stochastic Gradient Descent
from sklearn.neighbors import KNeighborsClassifier # k-NN ensemble method
from sklearn.ensemble import RandomForestClassifier

rs = 10  # random_state shared by the models that accept one
lr = LogisticRegression(solver='sag', max_iter=100, random_state=rs, multi_class="multinomial")
mnb = MultinomialNB()
# Built from the directly imported SVC class so this line does not depend on
# `svm` still being the sklearn module at the moment it is overwritten.
svm = SVC()
rf = RandomForestClassifier(n_estimators=10, random_state=rs)
knn = KNeighborsClassifier()

# Parallel lists: short model name, Table column name, key into `d`, estimator.
models = ['lr', 'mnb', 'svm', 'rf', 'knn']
labels = ['label_' + name for name in models]
predictions = [name + "_predict" for name in models]
d = {}
initModels = [lr, mnb, svm, rf, knn]

# Fit every model on the training matrix and predict the labels of the df
# comments. zip keeps the loop in step with the lists instead of relying on
# a hard-coded count of five.
for prediction_key, estimator in zip(predictions, initModels):
    estimator.fit(xtrain, Y_train)
    d[prediction_key] = estimator.predict(xuser)

# One row per df comment, one predicted-label column per model.
Table = pd.DataFrame(columns=['comment'] + labels)
for column, prediction_key in zip(labels, predictions):
    Table[column] = d[prediction_key]
Table["comment"] = df["comment"]

# Empty 3-row frame, one column per model, for the sentiment percentages.
Ratios = pd.DataFrame(columns=labels, index=range(0,3))
def RatioFinder(model, table=None):
    """Return the share of positive, neutral and negative predictions of a model.

    Parameters
    ----------
    model : str
        Name of the prediction column to summarise, e.g. ``"label_lr"``.
    table : pandas.DataFrame, optional
        Frame holding that column. Defaults to the module-level ``Table``.

    Returns
    -------
    list of float
        ``[positive, neutral, negative]`` as percentages rounded to whole
        numbers. Only rows whose value is 1.0, 0.0 or -1.0 are counted. If no
        row matches, ``[0.0, 0.0, 0.0]`` is returned rather than dividing by
        zero.
    """
    if table is None:
        table = Table
    column = table[model]

    # Count in the order the result is returned: positive, neutral, negative.
    counts = [int((column == code).sum()) for code in (1.0, 0.0, -1.0)]
    total = sum(counts)
    if total == 0:
        return [0.0, 0.0, 0.0]

    # Scale to a percentage before rounding; rounding the fraction and then
    # multiplying by 100 yields values such as 28.999999999999996.
    return [round(100.0 * count / total, 0) for count in counts]

# RatioFinder returns [positive, neutral, negative], so rows 0, 1 and 2 of
# Ratios hold those percentages in that order. Assigning the whole column
# calls RatioFinder once per model instead of once per cell.
for model_column in labels:
    Ratios[model_column] = RatioFinder(model_column)

# Row-wise mean across the models: average positive / neutral / negative share.
all_models = pd.DataFrame(columns=['average'], index=range(0,3))
all_models["average"]= Ratios.mean(axis=1)

# Set the prediction to the mode of the row (majority vote of the models).
# mode(axis=1) returns a DataFrame with one column per tied value; column 0
# always exists and, on a tie, holds the smallest of the tied labels.
# Assigning the whole DataFrame to a single column fails when there is a tie.
vote_columns = ['label_lr','label_mnb','label_svm','label_rf','label_knn']
Table["Prediction"] = Table[vote_columns].mode(axis=1)[0]
df["label"] = Table["Prediction"]

# extracting comments for each label
# Tokens contain only letters at this point, so joining with a space gives
# the same text as joining with ", " and then deleting the commas.
df["com_remv"] = df["com_remv"].apply(' '.join)

'''df_words = df[["label","com_remv"]]
positive = df_words[df_words["label"]==1.0]
neutral = df_words[df_words["label"]==0.0]
negative = df_words[df_words["label"]==-1.0]
'''
def _top_words(comments):
    """Return the ten most frequent whitespace-separated words in *comments*."""
    words = ' '.join(comments).lower().split()
    return pd.Series(words).value_counts()[:10]


# Split the labelled frame by predicted sentiment and keep the
# stop-word-filtered text of each group.
predicted_label = df["label"]
p = df[predicted_label == 1]
n = df[predicted_label == -1]
ne = df[predicted_label == 0]
positive = p["com_remv"]
negative = n["com_remv"]
neutral = ne["com_remv"]

# most frequent words in each label
most_freq_pos = _top_words(positive)
most_freq_neg = _top_words(negative)
most_freq_neu = _top_words(neutral)


In [None]:
import dash; import os
from dash.dependencies import Input, Output, Event
import dash_core_components as dcc
import dash_html_components as html
import plotly; import flask
import glob; import plotly.plotly as py
import plotly.graph_objs as go
import sys; import csv
import pandas as pd; import base64

path = '/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/'

#os.chdir(path + 'images/')
#image_filename = 'wordcloud.png' # replace with your own image
#encoded_image = base64.b64encode(open(image_filename, 'rb').read())

'''os.chdir(path + 'dash/')
data = UD.data # user loaded dataset
df = UD.df # labeled dataset
all_models = UD.all_models # table of average model results for % pos, neg, neu
Ratios = UD.Ratios # % pos, neg, neu for each model
Table = UD.Table # classification for each comment by model
'''
# Per-model prediction column names; used as the values of the model dropdown.
model_options = ['label_lr', 'label_mnb', 'label_svm', 'label_rf', 'label_knn']

# Display name for each prediction column; used as the dropdown labels.
mydict = {'label_lr': 'Logistic Regression', 'label_mnb':'Multinomial Naive Bayes',
'label_svm':'Support Vector Machine', 'label_rf': 'Random Forest', 'label_knn': 'K-Nearest Neighbor'}

#img_file = '/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/images/wordcloud.png'
#encoded_image = base64.b64encode(open(img_file, 'rb').read())
#image_filename = '/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/images/wordcloud.png' # replace with your own image
#encoded_image = base64.b64encode(open(image_filename, 'rb').read())
#image_directory = '/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/images/wordcloud.png'
#list_of_images = [os.path.basename(x) for x in glob.glob('{}*.png'.format(image_directory))]
#static_image_route = '/static/'

'''colors = {
    'background': 'white',
    'graph_background': 'white',
    'text': 'purple',
    'subtext': 'black',
    'blue_pal': 'lightskyblue',
    'red_pal': 'lightroal',
    'yellow_pal': 'yellowgreen',
    'grey_pal': 'lightgrey'
}'''

# colors2 = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']
#colors2 = ['#B8F7D4', '#835AF1', '#7FA6EE', '#FEBFB3']
# Shared palette: the bar traces use entries 0, 1 and 2 for Positive, Neutral
# and Negative; the pie chart is given the whole list.
colors3 = ["#009999", '#BFD8BD', '#9C7CA5', '#ADB2D3']
# #835AF1 dark blue
# #7FA6EE light blue
# #B8F7D4 green

# extracting comments for each label
'''positive = UD.positive
negative = UD.negative
neutral = UD.neutral

# most frequent words in each label
most_freq_pos = UD.most_freq_pos
most_freq_neg = UD.most_freq_neg
most_freq_neu = UD.most_freq_neu'''

# word frequency bar plot
def _word_bar(counts, name, color):
    """Build one bar trace: words on the x axis, their counts on the y axis."""
    return go.Bar(
        x=counts.index,
        y=counts.values,
        name=name,
        marker=dict(color=color),
    )


# One trace per sentiment, each with its own palette entry.
Positive = _word_bar(most_freq_pos, "Positive", colors3[0])
Neutral = _word_bar(most_freq_neu, "Neutral", colors3[1])
Negative = _word_bar(most_freq_neg, "Negative", colors3[2])

# Buttons for the word-frequency bar chart. Each button's `visible` list has
# one entry per trace, in the order the traces are passed to the figure:
# Positive, Neutral, Negative. The "All" entry previously listed four flags
# although the figure only has three traces.
updatemenus = [
    dict(
        type="buttons",
        active=-1,  # no button is highlighted initially
        buttons=[
            dict(
                label='Positive',
                method='update',
                args=[{'visible': [True, False, False]},
                      {'title': 'Positive Comments'}],
            ),
            dict(
                label='Neutral',
                method='update',
                args=[{'visible': [False, True, False]},
                      {'title': 'Neutral Comments'}],
            ),
            dict(
                label='Negative',
                method='update',
                args=[{'visible': [False, False, True]},
                      {'title': 'Negative Comments'}],
            ),
            dict(
                label='All',
                method='update',
                args=[{'visible': [True, True, True]},
                      {'title': 'All Comments'}],
            ),
        ],
        pad={'r': 15, 't': 10},
    )
]

def generate_table(dataframe, max_rows=10):
    """Render the first rows of *dataframe* as an ``html.Table``.

    The first table row holds one header cell per column; it is followed by
    one row per data row, up to ``max_rows`` rows (or fewer if the frame is
    shorter).
    """
    column_names = dataframe.columns
    shown = min(len(dataframe), max_rows)

    header_row = html.Tr([html.Th(col) for col in column_names])
    body_rows = []
    for i in range(shown):
        cells = [html.Td(dataframe.iloc[i][col]) for col in column_names]
        body_rows.append(html.Tr(cells))

    return html.Table([header_row] + body_rows)

# The Dash application; the layout and the callbacks below are attached to it.
app = dash.Dash()

'''
-----------------------------------------------------------------
'''

'''
# Video Input Line
    dcc.Input(id='video-input', value='Enter Youtube video URL here', type='text',
        style={
                'position': 'relative',
                'width': '600px',
                'float': 'center',
                'display': 'inline-block'},
                ),

    html.Div(id='video-input-div',
        style={
                'position': 'relative',
                'width': '600px',
                'float': 'center',
                'display': 'inline-block'},
                ),
'''

# Page layout: header, model selector with pie chart (left), word-frequency
# bar chart (right), then a sentiment filter with the comment table.
app.layout = html.Div([

# Header
    html.H1(children='A YouTube Web App',
        # Dash style dictionaries use camelCased CSS property names.
        style={
            'padding': '10px',
            'textAlign': 'center',
            'fontSize': '40px'}
        ),

# Pie Chart
    html.Div(
        [
            dcc.Dropdown(
                id="MyModel",
                # 'All Models' is the initial value, so it is also offered as
                # an option; otherwise it could not be selected again once a
                # single model had been picked.
                options=[{'label': 'All Models', 'value': 'All Models'}] + [{
                    'label': mydict.get(str(i)),
                    'value': i
                } for i in model_options],
                value='All Models'),
            dcc.Graph(id='pie-graph')
        ],
            style={
                'float': 'left',
                'width': '40.00%',
                'padding': '10px 10px 10px 0px',
                'height': '300px'}
        ),

# Bar Chart; Right
    html.Div([
        dcc.Graph(
                id='bar-graph',
                figure={
                    'data': [Positive, Neutral, Negative],
                    'layout': go.Layout(title='Most Common Words', barmode='stack', showlegend=True,
                            updatemenus=updatemenus)
                        },
                style={
                'float': 'right',
                'width': '55.00%',
                'padding': '42px 0px 10px 10px',
                'height': '500px'
                }
                )
            ]),
    #html.H2("WordCloud"),
    #html.Img(src='data:image/png;base64,{}'.format(encoded_image)),

    #html.Img(src='/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/images/wordcloud.png',
    #    style={'width': '500px'})
    #html.Img(src='data:image/png;base64,{}'.format(encoded_image))

# Comment table with sentiment filter
    html.Div([
        dcc.Dropdown(
            id='my-table-dropdown',
            options=[{'label': i, 'value': i}
            for i in ['All Comments', 'Positive', 'Negative', 'Neutral']
            ],value=None),
        html.Div(id='table-container')
        ],
            style={'width': '49%',
            'display': 'inline-block',
            'padding': '0 20'}
            ),
])

'''
-----------------------------------------------------------------
'''
'''
    html.Div([
        dcc.Graph(
            id='bubble',

            figure={
            'data': go.Scatter(
                x = most_freq_neu.index,
                y = -1.0,
                name="Neutral",
                mode='markers',
                marker=dict(
                    size=most_freq_neu.values,
                    color='#7FA6EE'))
            },

            style={
                'float': 'right',
                'width': '55.00%',
                'padding': '42px 0px 10px 10px',
                'height': '500px'
                }
            )
        ])
'''

# pie chart
@app.callback(
    dash.dependencies.Output('pie-graph', 'figure'),
    [dash.dependencies.Input('MyModel', 'value')])
def update_graph(MyModel):
    """Build the sentiment pie chart for the model chosen in the dropdown.

    Parameters
    ----------
    MyModel : str or None
        Current value of the ``MyModel`` dropdown: one of ``model_options``,
        ``"All Models"``, or ``None`` when the dropdown has been cleared.

    Returns
    -------
    dict
        Figure dictionary with one ``go.Pie`` trace and a titled layout.
    """
    if MyModel in model_options:
        values = list(Ratios[MyModel])
        shown_name = mydict[MyModel]
    else:
        # "All Models", a cleared dropdown (None) or any unknown value: show
        # the average over all models. The previous code used the placeholder
        # values [.2, .3, .5] here and raised KeyError on None.
        values = list(all_models["average"])
        shown_name = "All Models"

    # Rows of Ratios / all_models are ordered positive, neutral, negative
    # (the order RatioFinder returns), so the labels must follow that order;
    # they were previously listed as Positive, Negative, Neutral.
    trace = go.Pie(labels=["Positive", "Neutral", "Negative"], values=values, hole=.2,
        name='MyModel', hoverinfo='label+percent',
        # textinfo is a '+'-joined flag list; written without spaces.
        textinfo='label+value', textfont=dict(size=20),
        marker=dict(colors=colors3))

    return {
        'data': [trace],
        'layout':
        go.Layout(
            title='Sentiment Ratios as Predicted by {}'.format(shown_name)
            )
    }

'''my_css_url = "https://github.com/adonovan7/YoutubeAnalysis/blob/master/dash/dash.css"
app.css.append_css({
    "external_url": my_css_url
})
'''

# table of comments
@app.callback(
    dash.dependencies.Output('table-container', 'children'),
    [dash.dependencies.Input('my-table-dropdown', 'value')])
def table_update(value):
    """Render the comment table for the sentiment chosen in the dropdown.

    ``"All Comments"`` shows every row of ``data``. Any other value is mapped
    to its numeric label and only rows with that label are shown; a value
    without a mapping (including ``None``) is compared against ``None``.
    """
    label_for_choice = {"Positive": 1.0, "Neutral": 0.0, "Negative": -1.0}
    comments = data[["label","comment"]]

    if value == "All Comments":
        return generate_table(comments)

    wanted_label = label_for_choice.get(value)
    matching = comments[comments["label"]==wanted_label]
    return generate_table(matching)

# Start the Dash server (with debug=True) only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    app.run_server(debug=True)