In [None]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [None]:
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
  """Extend ``seed_text`` with up to ``n_words`` words predicted by ``model``.

  On every step the text produced so far is encoded with ``tokenizer``,
  left-padded (and, if too long, truncated) to ``max_length`` tokens and fed
  to ``model``. The index with the highest score is mapped back to its word
  and appended to the text.

  Args:
    model: object whose ``predict(batch, verbose=0)`` returns one row of
      per-index scores for each input row.
    tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and
      ``word_index`` (word -> integer index).
    max_length: number of tokens per model input. The model in this notebook
      is built with ``input_length=max_length-1``, so pass that same value.
    seed_text: text to start generating from.
    n_words: maximum number of words to append.

  Returns:
    ``seed_text`` followed by the generated words, separated by single spaces.
  """
  # Invert the vocabulary once instead of scanning it on every step.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  in_text = seed_text

  for _ in range(n_words):
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
    # argmax over the last axis is the replacement for `predict_classes`,
    # which newer Keras/TensorFlow releases no longer provide.
    yhat = int(model.predict(encoded, verbose=0).argmax(axis=-1)[0])
    out_word = index_to_word.get(yhat)
    if out_word is None:
      # No word has this index (e.g. the padding index 0). The input would
      # not change, so every later step would predict the same thing.
      break
    in_text += ' ' + out_word
  return in_text


        

In [None]:
# Training corpus: a block of English prose kept verbatim (typos included).
# The tokenizer is fitted on this text and the training sequences are cut from
# it, so editing any character changes the vocabulary and the trained model.
data = """I kind of fell into data science by mistake.
Unlike most people nowadays, I had no idea about it until I was introduced to the field as an intern at Entrepreneurs RoundTable Accelerator and PCB:NG, the summer of 2017. My amazing bosses, Jonathan and Peter, were very interested in my personal and professional growth and wanted to make sure I was very much a part of the team. Unfortunately, I was unfamiliar with almost all of the core technology stack of the company and since Python was the only language out of the stack I knew, I was given a data analysis project and since then I developed an interest in data science and practically wrote all my Python code in Jupyter Notebooks.



My exploration of data science continued after my internship. I began spending the majority of my time learning MOOCs on data science and machine learning. I became so encapsulated in data science and machine learning that my social media feed, Google search history, YouTube video recommendations all became about data science and machine learning at a point.

This made me realize that the relevance of data science is in it's ability to solve real-world problems and not just the hype for hype sake.


At that time, I had begun creating content on LinkedIn and seeing a lot of advice on how to increase engagement on LinkedIn. For example, some people stated the best times to post to be on Tuesdays at 11 am. Some people said things like 'posts with pictures got the most engagement.' Others stated that tagging famous people in posts got a lot more views and creating video content was better than articles. As much as these pieces of advice were coming from a good place, none of it seemed to be based on data but more based on opinions and experience. And as a person who is data-driven, I decided to work on a project that analyzed the data of my LinkedIn content in order to provide insights.

In a sense, I was going to use my data science skills and experience to drive engagement of my LinkedIn content


Project: Data Analysis of LinkedIn content

Data Acquisition
Exploratory Analysis
Data Cleaning
Data Analysis
Data Visualization
Action
Results


Data Acquisition
This was the most important phase of the project. In order for this project to be successful, I needed data currency. Thankfully, I had been consistently producing content and so there was about more than a year's worth of data to analyze.

After clarifying my data source, it was necessary to mine that source. I could have written a Python script to scrape my LinkedIn content data but after careful research, I realized I could use Phantom Buster which is an online social media scraping tool.

No alt text provided for this image
I really like Phantom Buster because it has a lot of interesting APIs for social media particularly LinkedIn which I find very interesting. I used the LinkedIn Activities Extractor API to get the data of my posts. Since I had a lot of content, the process to a while to complete.

NB: It is important to follow all the instructions stated on the website in the right sequence in order to get the appropriate data set. Missing any step will result in loss of data.


Exploratory Analysis
After acquiring my data set in a '.csv' file, I took a first glimpse of what my data looked like in order to figure out what analysis could be performed.

The dataset had 8 columns and 253 rows. The columns were:

profileUrl - which was my LinkedIn web address where the posts were scrapped from
action - which stated whether a post was made on my profile or in a group
postContent - which was literally the content of each of my post down to even the spacing and punctuation.
likeCount - which indicated the number of likes each post garnered.
commentCount - which stated the number of comments on each post.
postDate - which stated how long the post has lasted and not necessarily the date of the post.
timestamp - stated the time each post was scrapped.
postUrl - which was the web address of the LinkedIn post


Data Cleaning
In a bid to clean up the data to make the analysis more efficient, I did the following:

The exploration of the data made me aware that there were columns that weren't necessary to provide insights. As a result, I deleted the postUrl, timestamp, and profileUrl columns in order to make the data cleaner.
In addition, I converted the data in the likeCount and commentCount columns from strings to integers so that they could be summed, averaged and aggregated.


Data Analysis
After exploring and cleaning up the data, I used the standard Python Pandas library to perform my initial analysis. Below are the stats obtained:

The total number of posts = 252
The total number of likes on posts = 8340
The total number of comments on posts = 727
The average number of likes on posts = 33
The average number of comments on posts = 2
The highest number of likes for a particular post = 513
The highest number of comments for a particular post = 51


A closer look at these stats made me want to analyze the texts used in my posts and so I began to delve into the content and utilized a bit of NLP to do the analysis. Below are the stats of the content of the texts:

The number of likes on posts that contain #365daysoflearning = 1820, which is about 1/8th of all the likes on my posts
The highest number of likes on a post that contains #365daysoflearning = 153
The highest number of comments on a post that contains #365daysoflearning = 14
The number of posts that contain the word python = 37
The number of posts that contain the word data = 41
Most used word = 'I'


Actions
These insights helped me make the decision to keep creating consistent content particularly posts that are related to technology which were the ones that received the most engagement.

The insights from this analysis also allowed me to refine the content I created allowing me to focus on telling the story of my journey in tech.

I began to also experiment more with video content which was the form that received the most engagement.



Results
The actions put in place after the insights increased my LinkedIn connections from around 3000 connections to 4185 currently and growing. This insights also made me aware of a need of people who love online learning which led me to create MOOC Anonymous group on LinkedIn.


Video Presentation of the Jupyter Notebook of the project.

Github Repo

In conclusion, if you're new to data science, gathering your own data and getting insights from it could be an excellent way to explore the skills you're learning and demonstrate your knowledge to potential employers"""

In [None]:
# Learn the word -> integer-index vocabulary from the corpus, which is passed
# to the tokenizer as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

In [None]:
# Number of distinct indices the model must handle: one per known word plus
# one extra, because Keras Tokenizer indices start at 1 and index 0 is the
# value pad_sequences fills with by default.
vocab_size = len(tokenizer.word_index)+1

In [None]:
# Build the training samples: for each '.'-delimited chunk of the corpus,
# emit every prefix that is at least two tokens long. The last token of a
# prefix later becomes the prediction target and the rest the model input.
sequences = []
for sentence in data.split('.'):
  token_ids = tokenizer.texts_to_sequences([sentence])[0]
  sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

In [None]:
# Left-pad every prefix to the length of the longest one so that the samples
# form a rectangular array. pad_sequences already returns a NumPy array, so
# no further conversion is needed before slicing.
max_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)
# split into input and output elements: all tokens but the last -> last token
X, y = sequences[:, :-1], sequences[:, -1]
# one-hot encode the target word index for the softmax output layer
y = to_categorical(y, num_classes=vocab_size)
# define model: embedding -> LSTM -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
model.summary()
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network; keep the History object so loss/accuracy curves can be
# inspected later instead of echoing its repr as the cell output
history = model.fit(X, y, epochs=500, verbose=2)

Max Sequence Length: 109
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 108, 10)           4150      
_________________________________________________________________
lstm (LSTM)                  (None, 50)                12200     
_________________________________________________________________
dense (Dense)                (None, 415)               21165     
Total params: 37,515
Trainable params: 37,515
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/500
35/35 - 0s - loss: 5.9012 - accuracy: 0.0659
Epoch 2/500
35/35 - 0s - loss: 5.3497 - accuracy: 0.0705
Epoch 3/500
35/35 - 0s - loss: 5.2215 - accuracy: 0.0705
Epoch 4/500
35/35 - 0s - loss: 5.2012 - accuracy: 0.0705
Epoch 5/500
35/35 - 0s - loss: 5.1937 - accuracy: 0.0705
Epoch 6/500
35/35 - 0s - loss: 5.1863 - accuracy: 0.0705
Epoch 7/50

<tensorflow.python.keras.callbacks.History at 0x7f2dc45dd6d8>

In [None]:
# Persist the network in two parts.
# 1) Architecture, as a JSON text file.
model_json = model.to_json()
with open("model.json", "w") as architecture_file:
    architecture_file.write(model_json)
# 2) Learned parameters, as an HDF5 file.
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [None]:
from keras.models import model_from_json
# Read the architecture description; the context manager closes the file
# even if reading raises.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
# Rebuild an (untrained) model with that architecture.
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

In [None]:
# Print the installed Keras version so the environment of this run is recorded.
import keras
print(keras.__version__)

2.4.3
