# **Data Collection and Retrieval Function**

In [None]:
# Data Retrieval Function

def get_data(search_term):
  """Collect recent tweets that carry a photo for a search term.

  Pages through the Twitter API v2 Recent Search endpoint until `n` photo
  tweets have been stored or the API reports no further page.

  Args:
    search_term: Query string passed to the API as the `query` parameter.

  Returns:
    A DataFrame indexed by tweet 'id' with the columns 'retweets', 'likes',
    'url', 'text', 'lang', 'source' and 'sensitive'. Only tweets whose first
    attachment is a photo are included. The frame is empty when the first
    request fails or returns no tweets.

  Raises:
    requests.RequestException: If the HTTP request itself fails or times out.
  """
  import pandas as pd
  import requests
  import json

  # Insert your own bearer token after setting up your Twitter Developer account
  bearer_token = ''
  headers = {'Authorization':('Bearer '+ bearer_token)}

  n = 550            # number of photo tweets to collect
  max_results = 100  # page size requested from the API
  total_retrieved = 0
  next_token = ""

  endpoint = 'https://api.twitter.com/2/tweets/search/recent'

  # Create the empty DataFrame with the columns you want
  df = pd.DataFrame(columns=['id', 'retweets', 'likes', 'url', 'text', 'lang', 'source', 'sensitive'])
  df.set_index('id', inplace=True)

  # stop when we have n results
  while total_retrieved < n:

    # Passing the query string as `params` lets requests URL-encode the search
    # term, so spaces, '&' or '#' in it cannot corrupt the request.
    params = {
      'query': search_term,
      'max_results': max_results,
      # Extra fields; not all of them are stored, they show what is possible
      'tweet.fields': 'attachments,public_metrics,text,lang,source,possibly_sensitive',
      'expansions': 'attachments.media_keys',
      'media.fields': 'media_key,type,url',
    }
    # the first time through the loop, we do not need the next_token parameter
    if next_token:
      params['next_token'] = next_token

    # make the request to the Twitter API Recent Search endpoint
    response = requests.get(endpoint, headers=headers, params=params, timeout=30)
    try:
      json_data = json.loads(response.text)
    except ValueError:
      # Not JSON at all: show what came back and stop instead of reusing stale data
      print(response.text)
      break

    # Error payloads and empty result pages carry no "data" list
    if 'data' not in json_data:
      print(response.text)
      break

    # Index the expanded media objects once per page; keep the first entry per key
    media_by_key = {}
    for media in json_data.get('includes', {}).get('media', []):
      media_by_key.setdefault(media['media_key'], media)

    for tweet in json_data['data']:
      # Only the first attachment of a tweet is considered
      media_keys = tweet.get('attachments', {}).get('media_keys', [])
      if not media_keys:
        continue

      media = media_by_key.get(media_keys[0])
      if media is None or media['type'] != 'photo':  # ignore videos and unknown keys
        continue

      image_url = media.get('url')
      if not image_url:
        continue

      # Only add the record in the DataFrame if a photo is found
      df.loc[tweet['id']] = [
        tweet['public_metrics']['retweet_count'],
        tweet['public_metrics']['like_count'],
        image_url,
        tweet['text'],
        tweet['lang'],
        tweet['source'],
        tweet['possibly_sensitive'],
      ]

      # Only count tweets that were stored, and stop exactly at n
      total_retrieved += 1
      if total_retrieved >= n:
        break

    # keep track of where to start next time, but quit if there are no more results
    next_token = json_data.get('meta', {}).get('next_token', "")
    if not next_token:
      break

  return df

# df = get_data('self driving')
# print(f'Number of records:\t{len(df)}')
# df.to_csv('twitter.csv')
# df.head()

  

# **Feature Cleaning Functions**

In [None]:
# Cleaning Functions

def bin_groups(df, percent=.05):
  """Collapse rare categories into a single 'Other' group, in place.

  For every non-numeric column except "url", "text" and "source", each value
  whose share of the rows is below `percent` is replaced by 'Other'.

  Args:
    df: DataFrame to clean; it is modified in place.
    percent: Minimum fraction of rows a value needs to keep its own group.

  Returns:
    The same DataFrame object that was passed in.
  """
  import pandas as pd
  n_rows = len(df)
  for col in df:
    if pd.api.types.is_numeric_dtype(df[col]) or col in ["url", "text", "source"]:
      continue
    # Series.iteritems() no longer exists in pandas 2.x; the counts are used
    # directly to find every rare value of the column in one pass.
    counts = df[col].value_counts()
    rare_values = counts[counts / n_rows < percent].index
    if len(rare_values) > 0:
      df.loc[df[col].isin(rare_values), col] = 'Other'
  return df

# Remove columns with more than 50% missing data
def drop_columns_missing_data(df, cutoff=.5):
  """Drop, in place, every column whose share of missing values exceeds `cutoff`.

  Args:
    df: DataFrame to prune; it is modified in place.
    cutoff: Largest tolerated fraction of missing values per column.

  Returns:
    The same DataFrame object that was passed in.
  """
  total_rows = len(df)
  too_sparse = []
  for name in df.columns:
    missing_share = df[name].isna().sum() / total_rows
    if missing_share > cutoff:
      too_sparse.append(name)
  # One in-place drop for all offending columns
  df.drop(columns=too_sparse, inplace=True)
  return df

def impute_mean(df):
  """Dummy-code non-numeric columns and fill missing values with column means.

  Args:
    df: DataFrame to impute; it is not modified.

  Returns:
    A new DataFrame of imputed values that keeps the index of `df`. Non-numeric
    columns are replaced by their dummy columns (first level dropped).
  """
  from sklearn.impute import SimpleImputer
  import pandas as pd
  import numpy as np

  non_numeric = [col for col in df if not pd.api.types.is_numeric_dtype(df[col])]
  if non_numeric:
    df = pd.get_dummies(df, columns=non_numeric, drop_first=True)

  imp = SimpleImputer(missing_values=np.nan, strategy='mean')
  # Pass the index through: the imputer returns a bare array, and rebuilding
  # the frame without it would replace the row labels with 0..n-1.
  return pd.DataFrame(imp.fit_transform(df), columns=df.columns, index=df.index)

def impute_KNN(df):
  """Dummy-code non-numeric columns, min-max scale, and impute with 5-nearest neighbours.

  Args:
    df: DataFrame to impute; it is not modified.

  Returns:
    A new DataFrame of scaled, imputed values that keeps the index of `df`.
    Values stay on the min-max scale; they are not transformed back.
  """
  from sklearn.impute import KNNImputer
  from sklearn.preprocessing import MinMaxScaler
  import pandas as pd

  non_numeric = [col for col in df if not pd.api.types.is_numeric_dtype(df[col])]
  if non_numeric:
    df = pd.get_dummies(df, columns=non_numeric, drop_first=True)

  # Scale first so every feature contributes comparably to neighbour distances.
  # The index is passed through because the transformers return bare arrays.
  df = pd.DataFrame(MinMaxScaler().fit_transform(df), columns=df.columns, index=df.index)
  imp = KNNImputer(n_neighbors=5, weights="uniform")
  return pd.DataFrame(imp.fit_transform(df), columns=df.columns, index=df.index)
          
def impute_reg(df):
  """Dummy-code non-numeric columns and impute missing values with IterativeImputer.

  Args:
    df: DataFrame to impute; it is not modified.

  Returns:
    A new DataFrame of imputed values that keeps the index of `df`.
  """
  # IterativeImputer is experimental; this import must come first to enable it
  from sklearn.experimental import enable_iterative_imputer
  from sklearn.impute import IterativeImputer
  import pandas as pd

  non_numeric = [col for col in df if not pd.api.types.is_numeric_dtype(df[col])]
  if non_numeric:
    df = pd.get_dummies(df, columns=non_numeric, drop_first=True)

  imp = IterativeImputer(max_iter=10, random_state=12345)
  # Pass the index through: the imputer returns a bare array, and rebuilding
  # the frame without it would replace the row labels with 0..n-1.
  return pd.DataFrame(imp.fit_transform(df), columns=df.columns, index=df.index)

# ------------------------Additional Cleaning---------------------------------------------------------

#remove retweets, get only original tweets
def remove_retweets(df):
  """Return the rows whose 'text' does not contain the retweet marker "RT @".

  Args:
    df: DataFrame with a 'text' column; it is not modified.

  Returns:
    An independent copy of the non-retweet rows. Rows with missing text are
    kept. Returning a copy rather than a slice lets later steps assign
    columns without pandas' SettingWithCopy warning.
  """
  # regex=False: "RT @" is a literal marker; na=False: missing text is not a retweet
  is_retweet = df['text'].str.contains("RT @", regex=False, na=False)
  return df[~is_retweet].copy()


def bin_source_label_groups(df):
  """Group tweet sources into 'Direct to Twitter' and 'Bot', in place.

  A source equal to 'Twitter Web App' or 'Twitter', or containing
  'Twitter for ', becomes 'Direct to Twitter'. Any other source containing
  "bot" (case-insensitive) becomes 'Bot'. All other values are left as is.

  The column is looked up by name, so the result does not depend on column
  order, and no pandas display options are changed.

  Args:
    df: DataFrame with a 'source' column; it is modified in place.

  Returns:
    The same DataFrame object that was passed in.
  """
  source = df['source']
  # Non-string entries (e.g. missing values) are treated as empty text so they
  # match neither rule and stay untouched.
  text = source.where(source.map(lambda value: isinstance(value, str)), '')

  direct = (
    text.isin(['Twitter Web App', 'Twitter'])
    | text.str.contains('Twitter for ', regex=False).astype(bool)
  )
  # The bot rule only applies where the direct rule did not match
  bot = ~direct & text.str.lower().str.contains('bot', regex=False).astype(bool)

  df.loc[direct, 'source'] = 'Direct to Twitter'
  df.loc[bot, 'source'] = 'Bot'
  return df


def dummy_codes(df):
  """Return a copy of `df` with numeric counts and dummy-coded categories.

  'retweets' and 'likes' are converted to numeric on the copy. Every other
  non-numeric column except 'url' and 'text' is replaced by its dummy columns
  (first level dropped).

  Args:
    df: DataFrame with 'retweets' and 'likes' columns; it is not modified.

  Returns:
    A new DataFrame; the input frame is left untouched.
  """
  import pandas as pd

  df_dummies = df.copy()

  # Convert on the copy that is returned, not on the caller's frame
  df_dummies["retweets"] = pd.to_numeric(df_dummies["retweets"])
  df_dummies["likes"] = pd.to_numeric(df_dummies["likes"])

  protected = ["url", "text", 'retweets', 'likes']
  to_encode = [
    col for col in df_dummies
    if col not in protected and not pd.api.types.is_numeric_dtype(df_dummies[col])
  ]
  if to_encode:
    df_dummies = pd.get_dummies(df_dummies, columns=to_encode, drop_first=True)

  return df_dummies


def get_sentiment(df):
  """Add VADER sentiment scores for the 'text' column.

  Args:
    df: DataFrame with a 'text' column; it is not modified.

  Returns:
    A copy of `df` with four float columns: 'sentiment_overall' (VADER's
    compound score), 'sentiment_neg', 'sentiment_neu' and 'sentiment_pos'.
  """
  import nltk
  import numpy as np
  from nltk.sentiment import SentimentIntensityAnalyzer

  nltk.download('vader_lexicon')
  sia = SentimentIntensityAnalyzer()

  df_tweets = df.copy()

  # Score by column name so the result does not depend on where 'text' sits
  scores = [sia.polarity_scores(text) for text in df_tweets['text']]

  score_columns = (
    ('sentiment_overall', 'compound'),
    ('sentiment_neg', 'neg'),
    ('sentiment_neu', 'neu'),
    ('sentiment_pos', 'pos'),
  )
  for column, key in score_columns:
    # A float array is assigned positionally and keeps the dtype even with no rows
    df_tweets[column] = np.array([score[key] for score in scores], dtype=float)

  return df_tweets


def image_cleaning(df, cascade_path='/content/drive/MyDrive/Colab Notebooks/data/haarcascades/haarcascade_frontalface_default.xml'):
  """Add face counts and OCR text for the image behind each row's 'url'.

  Each image is downloaded and decoded in memory. Faces are counted with an
  OpenCV Haar cascade and text is extracted with Tesseract. Rows whose image
  cannot be downloaded or decoded keep the defaults (0 faces, empty text).

  Requires the `pytesseract` package and the `tesseract-ocr` binary:
    # !pip install pytesseract
    # !sudo apt install tesseract-ocr

  Args:
    df: DataFrame with a 'url' column; it is not modified.
    cascade_path: Path to the frontal-face Haar cascade XML file. The default
      is the Google Drive location used by the notebook.

  Returns:
    A copy of `df` with the added columns 'num_faces' and 'image_text'.
  """
  from io import BytesIO
  import re

  import cv2
  import numpy as np
  import pytesseract
  import requests
  from PIL import Image

  df_eng = df.copy()
  df_eng['num_faces'] = 0
  df_eng['image_text'] = ""

  face_cascade = cv2.CascadeClassifier(cascade_path)
  whitespace = re.compile(r'\s+')

  for row_id, image_url in df_eng['url'].items():
    try:
      response = requests.get(image_url, timeout=30)
      response.raise_for_status()
    except requests.RequestException as err:
      print(f'Skipping {image_url}: {err}')
      continue

    # Decode straight from the downloaded bytes; no temporary file is written
    image = cv2.imdecode(np.frombuffer(response.content, dtype=np.uint8), cv2.IMREAD_COLOR)
    if image is None:
      print(f'Skipping {image_url}: not a decodable image')
      continue

    grayimg = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(image=grayimg, scaleFactor=1.3, minNeighbors=5)
    df_eng.loc[row_id, 'num_faces'] = len(faces)

    text = pytesseract.image_to_string(Image.open(BytesIO(response.content)))
    # Collapse newlines and runs of spaces from the OCR output into single spaces
    df_eng.loc[row_id, 'image_text'] = whitespace.sub(' ', text)

  # Return the enriched copy, i.e. the frame the results were written to
  return df_eng

In [None]:
#df.isna().sum() / len(df)

# **Modeling Functions - Imports / Initialization**

In [None]:
# Modeling Functions 

def fs_variance(df, label="", p=0.8):
  """Keep only features whose variance exceeds p * (1 - p).

  Args:
    df: DataFrame of features, optionally including the label column.
    label: Name of the label column to exclude from selection and add back
      afterwards. With the default "" every column is treated as a feature.
    p: Probability used to derive the variance threshold p * (1 - p).

  Returns:
    A DataFrame with the selected feature columns, followed by the label
    column when `label` is given.
  """
  from sklearn.feature_selection import VarianceThreshold

  has_label = label != ""
  # Without a label every column is a candidate feature
  X = df.drop(columns=[label]) if has_label else df

  sel = VarianceThreshold(threshold=(p * (1 - p)))
  sel.fit(X)

  selected = list(sel.get_feature_names_out())
  if has_label:
    # Add the label back in after removing poor features
    selected.append(label)
  return df[selected]

def fit_mlr(df, test_size=.2, random_state=12345, label=''):
  """Fit a linear regression on the numeric columns and print its test R-squared.

  Args:
    df: DataFrame containing the label and the features.
    test_size: Fraction of rows held out for scoring.
    random_state: Seed for the train/test split.
    label: Name of the (numeric) column to predict; must be provided.

  Returns:
    The LinearRegression model fitted on the training split.
  """
  from sklearn.linear_model import LinearRegression
  from sklearn.model_selection import train_test_split
  import numpy as np  # imported here so the function does not rely on a global `np`

  numerical_df = df.select_dtypes(include=np.number)

  X = numerical_df.drop(columns=[label])
  y = numerical_df[label]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
  model = LinearRegression().fit(X_train, y_train)
  print(f'R-squared (mlr): \t{model.score(X_test, y_test)}')
  return model

def fit_crossvalidate_mlr(df, k, label, repeat=True):
  """Cross-validate a linear regression on the numeric columns, then fit it on all rows.

  Prints the mean R-squared across folds.

  Args:
    df: DataFrame containing the label and the features.
    k: Number of folds.
    label: Name of the (numeric) column to predict.
    repeat: If True use RepeatedKFold with 5 repeats, otherwise a single
      shuffled KFold.

  Returns:
    A LinearRegression model fitted on every row of the numeric data.
  """
  from sklearn.linear_model import LinearRegression
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  import numpy as np  # imported here so the function does not rely on a global `np`

  # select only numeric columns
  df = df.select_dtypes(include=np.number)

  X = df.drop(columns=[label])
  y = df[label]
  if repeat:
    cv = RepeatedKFold(n_splits=k, n_repeats=5, random_state=12345)
  else:
    cv = KFold(n_splits=k, random_state=12345, shuffle=True)
  scores = cross_val_score(LinearRegression(), X, y, scoring='r2', cv=cv, n_jobs=-1)
  print(f'Average R-squared:\t{np.mean(scores)}')
  return LinearRegression().fit(X, y)


def fi_selectkbest(df, label, k=10):
  """Keep the `k` features with the highest Pearson correlation to the label.

  Args:
    df: DataFrame of numeric features plus the label column.
    label: Name of the label column.
    k: Number of features to keep (or 'all'). An integer larger than the
      number of available features is reduced to that number.

  Returns:
    A DataFrame with the selected feature columns followed by the label.
  """
  from sklearn.feature_selection import SelectKBest, r_regression
  y = df[label]
  X = df.drop(columns=[label])

  # Asking for more features than exist is rejected by some scikit-learn versions
  if isinstance(k, int):
    k = min(k, X.shape[1])

  # NOTE(review): scores are signed correlations, so strongly negative features
  # rank lowest - confirm that is intended.
  sel = SelectKBest(r_regression, k=k)
  sel.fit(X, y)
  return df[[*sel.get_feature_names_out(), label]]

def fi_select_linear(df, label):
  """Select features with SelectFromModel on a fitted LinearRegression.

  Args:
    df: DataFrame of features plus the label column.
    label: Name of the label column.

  Returns:
    A DataFrame with the selected feature columns followed by the label.
  """
  from sklearn.feature_selection import SelectFromModel
  from sklearn.linear_model import LinearRegression

  features = df.drop(columns=[label])
  target = df[label]

  selector = SelectFromModel(LinearRegression().fit(features, target), prefit=True)
  # The transformed matrix is not used; columns are picked via the support mask below
  selector.transform(features)

  keep = [name for name, chosen in zip(features.columns, selector.get_support()) if chosen]
  keep.append(label)
  return df[keep]

def fi_select_trees(df, label):
  """Select features with SelectFromModel on a fitted ExtraTreesClassifier.

  The classifier is seeded with the same random_state used elsewhere in this
  file, so repeated runs select the same features.

  Args:
    df: DataFrame of features plus the label column.
    label: Name of the label column (a class label, since a classifier is fitted).

  Returns:
    A DataFrame with the selected feature columns followed by the label.
  """
  from sklearn.ensemble import ExtraTreesClassifier
  from sklearn.feature_selection import SelectFromModel
  y = df[label]
  X = df.drop(columns=[label])

  model = ExtraTreesClassifier(random_state=12345).fit(X, y)
  sel = SelectFromModel(model, prefit=True)

  columns = list(X.columns[sel.get_support()])
  columns.append(label)

  return df[columns]


# **Clustering Model**

In [None]:
def clustering_model(df, label):
  """Cluster the rows with average-linkage agglomerative clustering on Gower distances.

  The label column is left out of the distance computation. Cluster ids are
  written to a new 'agg_cluster' column and their counts are printed.

  Args:
    df: DataFrame to cluster; it is modified in place.
    label: Name of the column to exclude from the distance matrix.

  Returns:
    The same DataFrame object with the 'agg_cluster' column added. The fitted
    estimator itself is not returned.
  """
  from sklearn.cluster import AgglomerativeClustering
  try:
    import gower
  except ImportError:
    # Plain-Python equivalent of the notebook's `!pip install gower`, run only
    # when the package is actually missing.
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'gower'])
    import gower

  X = df.drop(columns=[label])
  distance_matrix = gower.gower_matrix(X)

  # Newer scikit-learn names this parameter `metric`; older releases only know `affinity`
  try:
    agg = AgglomerativeClustering(metric="precomputed", linkage="average")
  except TypeError:
    agg = AgglomerativeClustering(affinity="precomputed", linkage="average")
  agg.fit(distance_matrix)
  df['agg_cluster'] = agg.labels_

  print(df.agg_cluster.value_counts(), '\n\n')
  return df

# **Output Functions**

In [None]:
# Output
def dump_pickle(model, file_name):
  """Serialize `model` to `file_name` with pickle.

  Args:
    model: Any picklable object.
    file_name: Path of the file to create or overwrite.
  """
  import pickle
  # The context manager closes (and flushes) the file even if pickling fails
  with open(file_name, "wb") as handle:
    pickle.dump(model, handle)

def load_pickle(file_name):
  """Load and return the object pickled in `file_name`.

  Args:
    file_name: Path of a file written with pickle.

  Returns:
    The unpickled object.
  """
  import pickle
  # SECURITY: unpickling can execute arbitrary code; only load files you created
  with open(file_name, "rb") as handle:
    return pickle.load(handle)

# **Algorithm Selection Functions**

In [None]:
# Regression Model Algorithm Selection (5 algorithms for regression)

def fit_crossvalidate_reg(df, label, k=10, n=5, repeat=True):
  """Cross-validate five regressors and return the best one fitted on all rows.

  Linear regression, Ridge, Lasso, AdaBoost and XGBoost are scored by mean
  R-squared across folds. The ranking is printed.

  Args:
    df: DataFrame of features plus the label column.
    label: Name of the column to predict.
    k: Number of folds.
    n: Number of repeats when `repeat` is True.
    repeat: If True use RepeatedKFold, otherwise a single shuffled KFold.

  Returns:
    The top-ranked estimator, fitted on every row.
  """
  import pandas as pd
  import sklearn.ensemble as se
  import sklearn.linear_model as lm
  from numpy import mean
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  from xgboost import XGBRegressor

  y = df[label]
  X = df.drop(columns=[label])

  if not repeat:
    cv = KFold(n_splits=k, random_state=12345, shuffle=True)
  else:
    cv = RepeatedKFold(n_splits=k, n_repeats=n, random_state=12345)

  # Candidate estimators, keyed by the name shown in the printed ranking
  candidates = {
    'MLR': lm.LinearRegression(),
    'Ridge': lm.Ridge(),
    'Lasso': lm.Lasso(alpha=0.1),
    'AdaBoost': se.AdaBoostRegressor(random_state=12345, n_estimators=100),
    'XGBoost': XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8),
  }

  scores = {}
  for name, estimator in candidates.items():
    fold_scores = cross_val_score(estimator, X, y, scoring='r2', cv=cv, n_jobs=-1)
    scores[name] = mean(fold_scores)

  ranking = pd.DataFrame({'R-squared': scores})
  ranking = ranking.sort_values(by=['R-squared'], ascending=False)
  print(ranking)

  winner = ranking.index[0]
  return candidates[winner].fit(X, y)



In [None]:
# Classification Model Algorithm Selection (5 algorithms for classification)

def fit_crossvalidate_clf(df, label, k=10, n=5, repeat=True):
  """Cross-validate five classifiers and return the best one fitted on all rows.

  Logistic regression, k-nearest neighbours, AdaBoost, ExtraTrees and XGBoost
  are scored by mean accuracy across folds. The ranking is printed.

  Args:
    df: DataFrame of features plus the label column.
    label: Name of the column to predict.
    k: Number of folds.
    n: Number of repeats when `repeat` is True.
    repeat: If True use RepeatedKFold, otherwise a single shuffled KFold.

  Returns:
    The top-ranked estimator, fitted on every row.
  """
  import pandas as pd
  import sklearn.ensemble as se
  import sklearn.linear_model as lm
  from numpy import mean
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  from sklearn.neighbors import KNeighborsClassifier
  from xgboost import XGBClassifier

  y = df[label]
  X = df.drop(columns=[label])

  if not repeat:
    cv = KFold(n_splits=k, random_state=12345, shuffle=True)
  else:
    cv = RepeatedKFold(n_splits=k, n_repeats=n, random_state=12345)

  # Candidate estimators, keyed by the name shown in the printed ranking
  candidates = {
    'Logistic': lm.LogisticRegression(max_iter=100),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'AdaBoost': se.AdaBoostClassifier(n_estimators=100, random_state=12345),
    'ExtraTrees': se.ExtraTreesClassifier(n_estimators=100, random_state=12345),
    'XGBoost': XGBClassifier(),
  }

  scores = {}
  for name, estimator in candidates.items():
    fold_scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    scores[name] = mean(fold_scores)

  ranking = pd.DataFrame({'Accuracy': scores})
  ranking = ranking.sort_values(by=['Accuracy'], ascending=False)
  print(ranking)

  winner = ranking.index[0]
  return candidates[winner].fit(X, y)

# **Pipeline Function Calls**

In [None]:
# Regression Model Pipeline
# LABEL = likes

import pandas as pd
import numpy as np

# Data retrieval pipeline
data = get_data('self driving')
# Work on a copy: some cleaning steps modify their input in place, and the raw
# `data` frame is reused by the other pipeline cells.
df = data.copy()

# Data cleaning pipeline
df = bin_source_label_groups(df)
df = drop_columns_missing_data(df)
# df = impute_KNN(df)    #not currently applicable since we don't have missing data
df = remove_retweets(df)
df = dummy_codes(df)
df = get_sentiment(df)
df[['retweets', 'likes']] = df[['retweets', 'likes']].apply(pd.to_numeric) # converting "retweets" and "likes" datatype from an object to numeric
#df = image_cleaning(df)
numerical_df = df.select_dtypes(include=np.number) # including only numerical columns

df = fi_selectkbest(numerical_df, 'likes', 10) # feature importance
#df = fi_select_linear(numerical_df, 'likes')

# Modeling pipeline
#model = fit_crossvalidate_mlr(df, 10, 'likes')
model = fit_crossvalidate_reg(df, 'likes', 5, 2)
#clustering_model(df)

#print(df.shape)
#print(df.dtypes)
#df.head()

# Deployment pipeline
dump_pickle(model, 'Group_Project_Regression_Model.sav')

# Saved model: reload it and predict on the data it was fitted on
model = load_pickle('Group_Project_Regression_Model.sav')
model.predict(df.drop(columns=['likes']))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
          R-squared
Lasso     -0.412137
Ridge     -0.441895
MLR       -0.463956
AdaBoost  -1.140375
XGBoost   -1.868410


array([-2.71575065e+00, -1.66834272e+00, -3.85468656e+00,  9.90902801e+00,
       -1.52207412e+00, -5.04184336e+00,  4.09056288e+00,  5.45653553e-02,
        1.06682346e+00, -8.31235166e-01, -3.42069968e+00,  3.78567319e-01,
       -3.39210120e+00,  4.09056288e+00,  5.79669507e-02,  4.14803963e+01,
       -2.02434476e+00, -4.51913153e+00, -1.52207412e+00,  5.79669507e-02,
        1.83211826e+01,  4.09056288e+00, -1.52207412e+00,  9.70319987e+00,
       -3.42069968e+00,  5.75139184e+00, -3.92498620e+00,  4.68432679e+01,
       -1.52207412e+00, -1.52207412e+00,  1.04833307e+01, -1.52207412e+00,
       -3.74728446e+00, -2.54595434e+00, -2.65080525e+00,  2.78473597e+01,
       -3.42069968e+00,  1.53158369e+01, -3.81123531e-01,  3.16488202e-01,
       -3.81123531e-01, -1.52207412e+00, -3.81123531e-01, -3.81123531e-01,
       -1.52207412e+00, -1.09003988e+00,  1.16330383e+01,  7.80457432e+00,
       -4.77198346e+00, -3.78353652e+00,  2.88627359e+01,  9.81500107e-01,
       -3.39210120e+00, -

In [None]:
# Classification Model Pipeline
# LABEL = source

import pandas as pd
import numpy as np

# Data retrieval pipeline
#data = get_data('self driving')
# Work on a copy: some cleaning steps modify their input in place, and the raw
# `data` frame is shared with the other pipeline cells.
df = data.copy()

# Data cleaning pipeline
df = bin_source_label_groups(df)
df = drop_columns_missing_data(df)
# df = impute_KNN(df)    #not currently applicable since we don't have missing data
df = remove_retweets(df)
df = get_sentiment(df)
# Dummy-code everything except the label, then put the label back
df = dummy_codes(df.drop(columns=['source'])).join(df['source'])

df[['retweets', 'likes']] = df[['retweets', 'likes']].apply(pd.to_numeric) # converting "retweets" and "likes" datatype from an object to numeric
df = df.drop(columns=['url', 'text']) # dropping string columns

#df = image_cleaning(df)
#numerical_df = df.select_dtypes(include=np.number) # including only numerical columns

df = fs_variance(df, 'source', p=.8) # feature selection
#df = fi_selectkbest(numerical_df, 'likes', 10) # feature importance
#df = fi_select_linear(numerical_df, 'likes')

# Modeling pipeline
model = fit_crossvalidate_clf(df, 'source', 5, 2)

#print(df.shape)
#print(df.dtypes)
df.head()

# Deployment pipeline
dump_pickle(model, 'Group_Project_Classification_Model.sav')

# Saved model: reload it and predict on the data it was fitted on
model = load_pickle('Group_Project_Classification_Model.sav')
model.predict(df.drop(columns=['source']))

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
            Accuracy
XGBoost     0.470787
ExtraTrees  0.444944
Logistic    0.384270
AdaBoost    0.378652
KNN         0.350562


array(['Direct to Twitter', 'Direct to Twitter', 'Direct to Twitter',
       'Direct to Twitter', 'Direct to Twitter', 'Direct to Twitter',
       'Direct to Twitter', 'Direct to Twitter', 'Direct to Twitter',
       'Direct to Twitter', 'Direct to Twitter', 'HubSpot', 'Echobox',
       'dlvr.it', 'dlvr.it', 'Direct to Twitter', 'Direct to Twitter',
       'dlvr.it', 'dlvr.it', 'dlvr.it', 'Direct to Twitter',
       'Direct to Twitter', 'dlvr.it', 'Direct to Twitter',
       'Direct to Twitter', 'Direct to Twitter', 'Direct to Twitter',
       'Direct to Twitter', 'dlvr.it', 'dlvr.it', 'Direct to Twitter',
       'dlvr.it', 'tweeter_biases', 'dlvr.it', 'Direct to Twitter',
       'Direct to Twitter', 'Direct to Twitter', 'Direct to Twitter',
       'dlvr.it', 'HubSpot', 'dlvr.it', 'dlvr.it', 'dlvr.it', 'dlvr.it',
       'dlvr.it', 'Direct to Twitter', 'The Social Jukebox', 'TweetDeck',
       'Direct to Twitter', 'Direct to Twitter', 'Direct to Twitter',
       'IFTTT', 'Echobox', 'Dir

In [None]:
# Clustering Model Pipeline
# LABEL = source    (clustering_model only leaves this column out of the distance matrix)

import pandas as pd
import numpy as np

# Data retrieval pipeline
#data = get_data('self driving')

# data = data.astype(str)
# data.dtypes.head()
# Work on a copy: some cleaning steps modify their input in place, and the raw
# `data` frame is shared with the other pipeline cells.
df = data.copy()

# Data cleaning pipeline
df = bin_source_label_groups(df)
df = drop_columns_missing_data(df)
# df = impute_KNN(df)    #not currently applicable since we don't have missing data
df = remove_retweets(df)
df = get_sentiment(df)
# Dummy-code everything except the label, then put the label back
df = dummy_codes(df.drop(columns=['source'])).join(df['source'])

#df[['retweets', 'likes']] = df[['retweets', 'likes']].apply(pd.to_numeric) # converting "retweets" and "likes" datatype from an object to numeric
df = df.drop(columns=['url', 'text']) # dropping string columns

#df = image_cleaning(df)
#numerical_df = df.select_dtypes(include=np.number) # including only numerical columns

df = fs_variance(df, 'source', p=.8) # feature selection
#df = fi_selectkbest(numerical_df, 'likes', 10) # feature importance
#df = fi_select_linear(numerical_df, 'likes')


# Modeling pipeline
# clustering_model returns the DataFrame with an added 'agg_cluster' column,
# so `model` here is that DataFrame rather than a fitted estimator.
model = clustering_model(df, 'source')

#print(df.shape)
print(df.dtypes)
#df.head()

# Deployment pipeline
dump_pickle(model, 'Group_Project_Clustering_Model.sav')

# Saved model
model = load_pickle('Group_Project_Clustering_Model.sav')
#model.predict(df.drop(columns=['source']))

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Collecting gower
  Downloading gower-0.0.5.tar.gz (4.4 kB)
Building wheels for collected packages: gower
  Building wheel for gower (setup.py) ... [?25l[?25hdone
  Created wheel for gower: filename=gower-0.0.5-py3-none-any.whl size=4231 sha256=85dd73f7992b757e66f485aef98fb318559afd9edb07d02542f2018996b08ea0
  Stored in directory: /root/.cache/pip/wheels/3e/f9/9a/67122a959a424e9cbb4557a8366c871a30e31cd75f0d003db4
Successfully built gower
Installing collected packages: gower
Successfully installed gower-0.0.5
0    443
1      2
Name: agg_cluster, dtype: int64 


retweets              object
likes                 object
sentiment_overall    float64
source                object
agg_cluster            int64
dtype: object


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')