In [None]:
import numpy as np
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.datasets import imdb
from keras.callbacks import EarlyStopping
from keras import regularizers

In [None]:
# Fetch the NLTK resources this notebook asks for: the 'punkt' tokenizer
# models and the 'stopwords' lists (the latter is read by preprocess_tweets).
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

# Constants
MAX_WORDS = 10000   # vocabulary cut-off passed to imdb.load_data; also the width of each encoded vector
MAX_LEN = 10000     # upper bound on how many token indices text_to_index returns
BATCH_SIZE = 32     # batch size handed to model.fit
EPOCHS = 50         # maximum number of epochs handed to model.fit

In [None]:
def load_imdb_data():
    """Load the Keras IMDB dataset and merge its two splits.

    The dataset is requested with ``num_words=MAX_WORDS``. The training and
    testing parts are concatenated (training first) so the caller can choose
    its own split.

    Returns:
        tuple: ``(data, targets)`` -- the concatenated review sequences and
        the concatenated targets, in matching order.
    """
    train_split, test_split = imdb.load_data(num_words=MAX_WORDS)
    train_reviews, train_labels = train_split
    test_reviews, test_labels = test_split

    # np.concatenate joins along axis 0 by default.
    data = np.concatenate([train_reviews, test_reviews])
    targets = np.concatenate([train_labels, test_labels])
    return data, targets


In [None]:
def vectorize(sequences, dimension = 10000):
    """Multi-hot encode integer sequences into a 2-D array.

    Args:
        sequences: A sized iterable of integer index lists, one per sample.
        dimension: Number of columns in the output (default 10000, the same
            value as MAX_WORDS). Every index must be smaller than this.

    Returns:
        numpy.ndarray: Array of shape ``(len(sequences), dimension)`` holding
        1 at ``[i, j]`` when index ``j`` occurs in ``sequences[i]`` and 0
        elsewhere. Repeated indices still produce a single 1.

    Raises:
        IndexError: If a sequence contains an index outside ``dimension``.
    """
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        # Fancy-index assignment switches on every listed column of row i at once.
        results[i, sequence] = 1
    return results

# Nothing is encoded at module level in this cell: `data` and `targets` do not
# exist until main() loads them, and main() vectorises them itself. Encoding
# here would raise NameError on a fresh kernel.

In [None]:
def vectorize_sequences(sequences):
    """Multi-hot encode index sequences into a ``(len(sequences), MAX_WORDS)`` array.

    Args:
        sequences: A sized iterable of integer index lists, one per sample.

    Returns:
        numpy.ndarray: One row per sequence, with 1 in every column whose
        index appears in that sequence and 0 everywhere else.
    """
    multi_hot = np.zeros((len(sequences), MAX_WORDS))
    # Iterating a 2-D array yields row views, so writing to `row` updates multi_hot.
    for row, word_ids in zip(multi_hot, sequences):
        row[word_ids] = 1
    return multi_hot

In [None]:
def build_model():
    """Assemble the (uncompiled) feed-forward classifier.

    The network takes ``MAX_WORDS``-wide input vectors and stacks three
    50-unit ReLU layers -- the first two L2-regularised and each followed by
    dropout (0.4, then 0.3) -- ahead of a single sigmoid output unit.

    Returns:
        The Sequential model; compiling is left to the caller.
    """
    l2_strength = 0.001
    layer_stack = [
        Dense(50, activation='relu', input_shape=(MAX_WORDS,),
              kernel_regularizer=regularizers.l2(l2_strength)),
        Dropout(0.4),
        Dense(50, activation='relu',
              kernel_regularizer=regularizers.l2(l2_strength)),
        Dropout(0.3),
        Dense(50, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layer_stack)

In [None]:
def compile_and_train_model(model, train_x, train_y, test_x, test_y):
    """Compile ``model`` for binary classification and fit it.

    Args:
        model: The model to compile and train (modified in place).
        train_x, train_y: Training features and targets.
        test_x, test_y: Held-out features and targets, passed to ``fit`` as
            validation data; their loss drives early stopping.

    Returns:
        Whatever ``model.fit`` returns for the training run.
    """
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Halt once val_loss has failed to improve for 3 consecutive epochs.
    stop_on_plateau = EarlyStopping(monitor='val_loss', patience=3)

    fit_options = {
        'epochs': EPOCHS,
        'batch_size': BATCH_SIZE,
        'validation_data': (test_x, test_y),
        'callbacks': [stop_on_plateau],
    }
    return model.fit(train_x, train_y, **fit_options)

In [None]:
def text_to_index(text, word_index):
    """Convert whitespace-separated text into a list of vocabulary indices.

    Args:
        text: The text to encode. Anything that is not a ``str`` (for example
            ``None``, or the NaN that pandas produces for an empty CSV cell)
            is treated as empty text.
        word_index: Mapping from lower-case token to integer index.

    Returns:
        list[int]: Indices of the tokens, in order, keeping only indices below
        ``MAX_WORDS`` and at most ``MAX_LEN`` entries. Tokens missing from
        ``word_index`` are encoded as 0 and kept.
    """
    # sentiment_analysis feeds this from a column loaded with pd.read_csv, where
    # an empty cell arrives as a float NaN; calling .lower() on it would raise.
    if not isinstance(text, str):
        return []

    indices = []
    for token in text.lower().split():
        token_index = word_index.get(token, 0)  # single lookup per token
        if token_index < MAX_WORDS:
            indices.append(token_index)
    return indices[:MAX_LEN]

In [None]:
def preprocess_tweets(filepath):
    """Add ``hashtags`` and ``processed_tweet`` columns to a tweets CSV, in place.

    The file is read, two columns are derived from its ``tweet`` column, and
    the result is written back to the same path (without the index).

    Args:
        filepath: Path of a CSV file that has a ``tweet`` column.

    Returns:
        None. The file at ``filepath`` is overwritten.
    """
    df = pd.read_csv(filepath)
    stop_words_set = set(stopwords.words('english'))

    def extract_hashtags(text):
        """Return the hashtag words (without the leading '#') found in ``text``."""
        return re.findall(r"#(\w+)", text)

    def preprocess_text(text):
        """Lower-case ``text`` and strip URLs, mentions, hashtags, punctuation and stop words."""
        text = text.lower()
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
        text = re.sub(r"@\w+", "", text)
        text = re.sub(r"#\w+", "", text)
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        tokens = [word for word in text.split() if word not in stop_words_set]
        return ' '.join(tokens)

    # read_csv yields NaN for empty cells and numbers for all-digit cells; both
    # would break the string/regex helpers above, so work on a text-only copy.
    # The original 'tweet' column itself is left untouched.
    tweets = df['tweet'].fillna('').astype(str)

    df['hashtags'] = tweets.apply(extract_hashtags)
    df['processed_tweet'] = tweets.apply(preprocess_text)
    df.to_csv(filepath, index=False)

In [None]:
def sentiment_analysis(file_path, model, word_index):
    """Score every processed tweet in a CSV and store the result in ``senti_score``.

    Each value of the ``processed_tweet`` column is encoded with
    ``text_to_index`` and ``vectorize_sequences`` and passed to
    ``model.predict``. Tweets that encode to no indices get a neutral 0.5.
    The file is rewritten in place with the new ``senti_score`` column.

    Args:
        file_path: Path of a CSV file that has a ``processed_tweet`` column.
        model: Object exposing ``predict(features)``; one output row per input row.
        word_index: Token-to-index mapping forwarded to ``text_to_index``.

    Returns:
        None. The file at ``file_path`` is overwritten.
    """
    df = pd.read_csv(file_path)

    # An empty processed tweet is stored as an empty CSV field and read back
    # as NaN; normalise to text so encoding cannot fail on a float.
    texts = df["processed_tweet"].fillna("").astype(str)
    encoded = [text_to_index(text, word_index) for text in texts]

    # Start from the neutral score and overwrite the rows that can be scored.
    senti_scores = [0.5] * len(encoded)
    scorable = [pos for pos, indices in enumerate(encoded) if indices]

    # Predict in chunks rather than once per tweet: far fewer predict() calls,
    # while the dense feature matrix for each chunk stays small.
    chunk_size = 512
    for start in range(0, len(scorable), chunk_size):
        chunk = scorable[start:start + chunk_size]
        features = vectorize_sequences([encoded[pos] for pos in chunk])
        predictions = np.asarray(model.predict(features)).reshape(len(chunk), -1)
        # Mean over the output axis mirrors np.mean over a single-row prediction.
        for pos, score in zip(chunk, predictions.mean(axis=1)):
            senti_scores[pos] = float(score)

    df["senti_score"] = senti_scores
    df.to_csv(file_path, index=False)


In [None]:
def main(folder_path="/Users/anshulrana/Desktop/vsc/python/Predicting_Success_Of_Startups/twitter_Data/tweets_data"):
    """Train the IMDB sentiment model, report its accuracy, then score tweet CSVs.

    Args:
        folder_path: Directory whose ``.csv`` files are preprocessed and
            scored in place. Defaults to the path originally hard-coded here.
    """
    data, targets = load_imdb_data()
    data = vectorize_sequences(data)
    targets = np.array(targets).astype("float32")

    # The first 10000 rows serve as both validation and evaluation data.
    test_x = data[:10000]
    test_y = targets[:10000]
    # NOTE(review): only rows 40000+ are used for training, so rows
    # 10000-39999 are never seen by the model -- confirm this is intended
    # (data[10000:] would use all remaining rows).
    train_x = data[40000:]
    train_y = targets[40000:]

    model = build_model()
    compile_and_train_model(model, train_x, train_y, test_x, test_y)

    scores = model.evaluate(test_x, test_y, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1] * 100))

    # imdb.load_data shifts every word index by index_from=3 (0, 1 and 2 are
    # reserved), whereas get_word_index returns the unshifted ranks. Apply the
    # same shift so tweet features line up with the features trained on.
    word_index = {word: rank + 3 for word, rank in imdb.get_word_index().items()}

    for file_name in os.listdir(folder_path):
        # Match on the file extension, not on ".csv" appearing anywhere in the path.
        if not file_name.lower().endswith(".csv"):
            continue
        file_path = os.path.join(folder_path, file_name)
        preprocess_tweets(file_path)
        sentiment_analysis(file_path, model, word_index)

if __name__ == "__main__":
    main()