In [1]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
df = pd.read_excel('Labeled_Financial_Projection.xlsx')

In [3]:
df.head()

Unnamed: 0,message,label
0,"""Alice (Project Manager)"", ""Hey Bob, did you h...",0
1,"""Bob (Financial Analyst)"", ""Yeah, I did. It’s ...",1
2,"""Alice"", ""Sure thing, I’ll get that done today...",0
3,"""Bob"", ""I’ve heard great things about it but h...",1
4,"""Alice"", ""Sounds good to me. The product devel...",0


In [4]:
df.shape

(378, 2)

In [5]:
df.isnull().any()

message    False
label      False
dtype: bool

In [6]:
df.isna().any()

message    False
label      False
dtype: bool

In [7]:
df.isna().sum()

message    0
label      0
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.dtypes

message    object
label       int64
dtype: object

In [10]:
df.describe()

Unnamed: 0,label
count,378.0
mean,0.518519
std,0.500319
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [11]:
X = df['message']
y = df['label']

In [12]:
df['label'].value_counts()

label
1    196
0    182
Name: count, dtype: int64

Algorithms expects numerical input, but the input data contains text (strings). We need to convert the text data into a numerical format before feeding it into the model. This process is known as feature extraction or text vectorization. We can use techniques like TF-IDF or Count Vectorizer to transform the text data into numerical vectors and then train the model

In [13]:
import re, string
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if word.casefold() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in text]
    return ' '.join(tokens)


In [14]:
df['preprocessed_text'] = df['message'].apply(preprocess_text)
df.head()

Unnamed: 0,message,label,preprocessed_text
0,"""Alice (Project Manager)"", ""Hey Bob, did you h...",0,Alice Project Manager Hey Bob hear new product...
1,"""Bob (Financial Analyst)"", ""Yeah, I did. It’s ...",1,Bob Financial Analyst Yeah really exciting Im ...
2,"""Alice"", ""Sure thing, I’ll get that done today...",0,Alice Sure thing Ill get done today Speaking t...
3,"""Bob"", ""I’ve heard great things about it but h...",1,Bob Ive heard great thing havent chance go yet...
4,"""Alice"", ""Sounds good to me. The product devel...",0,Alice Sounds good product development team fin...


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 3)).fit(df['preprocessed_text'])
x = vectorizer.fit_transform(X)
x

<378x6284 sparse matrix of type '<class 'numpy.int64'>'
	with 13308 stored elements in Compressed Sparse Row format>

In [16]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [17]:
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)
log_reg_pred_values = log_reg_model.predict(X_test)
accuracy = accuracy_score(log_reg_pred_values, y_test)
print(f'logistic regression accuarcy_score: {accuracy*100}')

logistic regression accuarcy_score: 85.52631578947368


In [18]:
print(classification_report(y_test, log_reg_pred_values))

              precision    recall  f1-score   support

           0       0.92      0.81      0.86        43
           1       0.79      0.91      0.85        33

    accuracy                           0.86        76
   macro avg       0.86      0.86      0.85        76
weighted avg       0.86      0.86      0.86        76



In [19]:
conf_matrix = confusion_matrix(y_test, log_reg_pred_values)
print("Confusion Matrix of logistic regression:")
print(conf_matrix)

Confusion Matrix of logistic regression:
[[35  8]
 [ 3 30]]


In [20]:
# Computing SVM
from sklearn.svm import SVC
svm_model = SVC()
svm_model.fit(X_train, y_train)
pred_values = svm_model.predict(X_test)
accuracy = accuracy_score(pred_values, y_test)
print(f'SVM accuarcy_score: {accuracy*100}')

SVM accuarcy_score: 78.94736842105263


In [21]:
print(classification_report(y_test, pred_values))

              precision    recall  f1-score   support

           0       1.00      0.63      0.77        43
           1       0.67      1.00      0.80        33

    accuracy                           0.79        76
   macro avg       0.84      0.81      0.79        76
weighted avg       0.86      0.79      0.79        76



In [22]:
conf_matrix = confusion_matrix(y_test, pred_values)
print("Confusion Matrix of SVM:")
print(conf_matrix)

Confusion Matrix of SVM:
[[27 16]
 [ 0 33]]


In [23]:
# Computing SVM using Linear Kernel
from sklearn.svm import SVC
svm_linear = SVC(kernel='linear', C= 1.0, random_state = 42)
svm_linear.fit(X_train, y_train)
svm_linear_pred = svm_linear.predict(X_test)
accuracy = accuracy_score(svm_linear_pred, y_test)
print(f'svm_linear accuarcy_score: {accuracy*100}')

svm_linear accuarcy_score: 88.1578947368421


In [24]:
print(classification_report(y_test, svm_linear_pred))

              precision    recall  f1-score   support

           0       0.93      0.86      0.89        43
           1       0.83      0.91      0.87        33

    accuracy                           0.88        76
   macro avg       0.88      0.88      0.88        76
weighted avg       0.89      0.88      0.88        76



In [25]:
conf_matrix = confusion_matrix(y_test, svm_linear_pred)
print("Confusion Matrix of SVM using Linear Kernel:")
print(conf_matrix)

Confusion Matrix of SVM using Linear Kernel:
[[37  6]
 [ 3 30]]


In [26]:
# Computing SVM using rbf Kernel
svm_rbf = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_rbf.fit(X_train, y_train)
svm_rbf_pred = svm_rbf.predict(X_test)
accuracy = accuracy_score(svm_rbf_pred, y_test)
print(f'svm_rbf accuarcy_score: {accuracy*100}')

svm_rbf accuarcy_score: 78.94736842105263


In [27]:
print(classification_report(y_test, svm_rbf_pred))

              precision    recall  f1-score   support

           0       1.00      0.63      0.77        43
           1       0.67      1.00      0.80        33

    accuracy                           0.79        76
   macro avg       0.84      0.81      0.79        76
weighted avg       0.86      0.79      0.79        76



In [28]:
conf_matrix = confusion_matrix(y_test, svm_rbf_pred)
print("Confusion Matrix of SVM using rbf Kernel:")
print(conf_matrix)

Confusion Matrix of SVM using rbf Kernel:
[[27 16]
 [ 0 33]]


In [29]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_pred_values = rf_model.predict(X_test)
accuracy = accuracy_score(rf_pred_values, y_test)
print(f'Random Forest accuarcy_score: {accuracy*100}')

Random Forest accuarcy_score: 72.36842105263158


In [30]:
print(classification_report(y_test, rf_pred_values))

              precision    recall  f1-score   support

           0       1.00      0.51      0.68        43
           1       0.61      1.00      0.76        33

    accuracy                           0.72        76
   macro avg       0.81      0.76      0.72        76
weighted avg       0.83      0.72      0.71        76



In [31]:
conf_matrix = confusion_matrix(y_test, rf_pred_values)
print("Confusion Matrix of Random Forest:")
print(conf_matrix)

Confusion Matrix of Random Forest:
[[22 21]
 [ 0 33]]


In [32]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred_values = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, dt_pred_values)
print(f"Decision Tree Accuracy: {accuracy*100}")

Decision Tree Accuracy: 78.94736842105263


In [34]:
conf_matrix = confusion_matrix(y_test, dt_pred_values)
print("Confusion Matrix of decision tree:")
print(conf_matrix)

Confusion Matrix of decision tree:
[[34  9]
 [ 7 26]]


In [35]:
class_report = classification_report(y_test, dt_pred_values)
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.79      0.81        43
           1       0.74      0.79      0.76        33

    accuracy                           0.79        76
   macro avg       0.79      0.79      0.79        76
weighted avg       0.79      0.79      0.79        76



In [36]:
import warnings
warnings.filterwarnings('ignore')

In [37]:
param_grid = {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga', 'newton-cg', 'lbfgs']}
grid_search = GridSearchCV(log_reg_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_}')

Best parameters: {'C': 100, 'solver': 'lbfgs'}
Best cross-validation accuracy: 0.8213661202185794


In [38]:
param_grid = {
	'n_estimators': [25, 50, 100, 150],
	'max_features': ['sqrt', 'log2', None],
	'max_depth': [3, 6, 9],
	'max_leaf_nodes': [3, 6, 9],
}
rf_grid_search = GridSearchCV(RandomForestClassifier(),
                           param_grid=param_grid)
rf_grid_search.fit(X_train, y_train)
print(f'Best parameters: {rf_grid_search.best_params_}')
print(f'Best cross-validation accuracy: {rf_grid_search.best_score_}')

Best parameters: {'max_depth': 6, 'max_features': None, 'max_leaf_nodes': 9, 'n_estimators': 150}
Best cross-validation accuracy: 0.7715300546448087


In [39]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear']}
SVM_grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
SVM_grid.fit(X_train, y_train)
print(f'Best parameters: {SVM_grid.best_params_}')
print(f'Best cross-validation accuracy: {SVM_grid.best_score_}')

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.721 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.852 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.867 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.750 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.883 total time=   0.0s
[CV 1/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.721 total time=   0.0s
[CV 2/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.852 total time=   0.0s
[CV 3/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.867 total time=   0.0s
[CV 4/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.750 total time=   0.0s
[CV 5/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.883 total time=   0.0s
[CV 1/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.721 total time=   0.0s
[CV 2/5] END ..C=0.1, gamma=0.01, kernel=linear

In [40]:
testing_data = ["The revenue for Q1 increased by 20%.",
    "Our net income this quarter is impressive.",
    "The financial report indicates a strong growth trend.",
    "Let's grab lunch together tomorrow.",
    "The financial report indicates a strong growth trend.",
    "I will be on vacation next week.",
    "We should plan a team-building activity.",
    "Our investment in new technology has paid off.",
    "Let's schedule a meeting for next Monday.",
    "I need your feedback on the document I sent.",
    "The product launch was a huge success.",
    "We need to reduce our expenses to improve our profit margin.",
    "Let's grab lunch together tomorrow."]

In [51]:
def financial_projection_detection(input_text):
    processed_text = [preprocess_text(input_text)]
    text = vectorizer.transform(processed_text)
    prediction = grid_search.predict(text)
    pred_result = 'Financial Data Detected' if prediction == 1 else 'No Financial Data Detected'
    label = 1 if prediction == 1 else 0
    result = {
        "Text": input_text,
        "Prediction": pred_result,
        "label": label,
        "logistic regression": classification_report(y_test, log_reg_pred_values)
    }
    return result

In [52]:
results = []

for input_text in testing_data:
    result = financial_projection_detection(input_text)
    results.append(result)

# Print or process the results as needed
for result in results:
    print(result)

{'Text': 'The revenue for Q1 increased by 20%.', 'Prediction': 'Financial Data Detected', 'label': 1, 'logistic regression': '              precision    recall  f1-score   support\n\n           0       0.92      0.81      0.86        43\n           1       0.79      0.91      0.85        33\n\n    accuracy                           0.86        76\n   macro avg       0.86      0.86      0.85        76\nweighted avg       0.86      0.86      0.86        76\n'}
{'Text': 'Our net income this quarter is impressive.', 'Prediction': 'Financial Data Detected', 'label': 1, 'logistic regression': '              precision    recall  f1-score   support\n\n           0       0.92      0.81      0.86        43\n           1       0.79      0.91      0.85        33\n\n    accuracy                           0.86        76\n   macro avg       0.86      0.86      0.85        76\nweighted avg       0.86      0.86      0.86        76\n'}
{'Text': 'The financial report indicates a strong growth trend.', 'P

In [54]:
import json, os
desired_directory = '/Users/indrakiranreddy/Desktop/Financial_Projection_Detection'
file_name = 'Financial_projection_detection.json'
file_path = os.path.join(desired_directory, file_name)
os.makedirs(desired_directory, exist_ok=True)

if os.path.isfile(file_path) and os.path.getsize(file_path) > 0:
    with open(file_path, 'r') as json_file:
        data = json.load(json_file)
else:
    data = []

# Function to add a new item to the in-memory list
def add_item(new_item):
    global data
    data.append(new_item)
    save_data()

# Function to save the in-memory list to the JSON file
def save_data():
    with open(file_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)
    print(f"Data has been saved to {file_path}")

for result in results:
    add_item(result)

print(f"Data has been written to {file_path}")

Data has been saved to /Users/indrakiranreddy/Desktop/Financial_Projection_Detection/Financial_projection_detection.json
Data has been saved to /Users/indrakiranreddy/Desktop/Financial_Projection_Detection/Financial_projection_detection.json
Data has been saved to /Users/indrakiranreddy/Desktop/Financial_Projection_Detection/Financial_projection_detection.json
Data has been saved to /Users/indrakiranreddy/Desktop/Financial_Projection_Detection/Financial_projection_detection.json
Data has been saved to /Users/indrakiranreddy/Desktop/Financial_Projection_Detection/Financial_projection_detection.json
Data has been saved to /Users/indrakiranreddy/Desktop/Financial_Projection_Detection/Financial_projection_detection.json
Data has been saved to /Users/indrakiranreddy/Desktop/Financial_Projection_Detection/Financial_projection_detection.json
Data has been saved to /Users/indrakiranreddy/Desktop/Financial_Projection_Detection/Financial_projection_detection.json
Data has been saved to /Users/in

In [55]:
data

[{'Text': 'The revenue for Q1 increased by 20%.',
  'Prediction': 'Financial Data Detected',
  'label': 1,
  'logistic regression': '              precision    recall  f1-score   support\n\n           0       0.92      0.81      0.86        43\n           1       0.79      0.91      0.85        33\n\n    accuracy                           0.86        76\n   macro avg       0.86      0.86      0.85        76\nweighted avg       0.86      0.86      0.86        76\n'},
 {'Text': 'Our net income this quarter is impressive.',
  'Prediction': 'Financial Data Detected',
  'label': 1,
  'logistic regression': '              precision    recall  f1-score   support\n\n           0       0.92      0.81      0.86        43\n           1       0.79      0.91      0.85        33\n\n    accuracy                           0.86        76\n   macro avg       0.86      0.86      0.85        76\nweighted avg       0.86      0.86      0.86        76\n'},
 {'Text': 'The financial report indicates a strong g

In [56]:
with open(file_path, 'r') as json_file:
    written_data = json.load(json_file)
    print(f"Written data: {written_data}")

Written data: [{'Text': 'The revenue for Q1 increased by 20%.', 'Prediction': 'Financial Data Detected', 'label': 1, 'logistic regression': '              precision    recall  f1-score   support\n\n           0       0.92      0.81      0.86        43\n           1       0.79      0.91      0.85        33\n\n    accuracy                           0.86        76\n   macro avg       0.86      0.86      0.85        76\nweighted avg       0.86      0.86      0.86        76\n'}, {'Text': 'Our net income this quarter is impressive.', 'Prediction': 'Financial Data Detected', 'label': 1, 'logistic regression': '              precision    recall  f1-score   support\n\n           0       0.92      0.81      0.86        43\n           1       0.79      0.91      0.85        33\n\n    accuracy                           0.86        76\n   macro avg       0.86      0.86      0.85        76\nweighted avg       0.86      0.86      0.86        76\n'}, {'Text': 'The financial report indicates a strong g

In [57]:
pip install pymongo



In [58]:
from urllib.parse import quote_plus
from pymongo import MongoClient

username = "ibonthu"
password = "@Reddy007"

# Escape username and password
escaped_username = quote_plus(username)
escaped_password = quote_plus(password)

# MongoDB connection string with escaped username and password
connection_string = f"mongodb+srv://{escaped_username}:{escaped_password}@cluster0.ds38g3h.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"

# Use the connection string to connect to MongoDB
client = MongoClient(connection_string)

In [59]:
my_db = client["Eitacies"]

# Create a collection
my_collection = my_db["Financial_projection_detection"]
for result in results:
  if '_id' in result:
    del result['_id']
  result = my_collection.insert_one(result)
  print(f"Inserted document ID: {result.inserted_id}")

Inserted document ID: 6675208b617b6f1ade89b401
Inserted document ID: 6675208c617b6f1ade89b402
Inserted document ID: 6675208c617b6f1ade89b403
Inserted document ID: 6675208c617b6f1ade89b404
Inserted document ID: 6675208c617b6f1ade89b405
Inserted document ID: 6675208c617b6f1ade89b406
Inserted document ID: 6675208c617b6f1ade89b407
Inserted document ID: 6675208c617b6f1ade89b408
Inserted document ID: 6675208c617b6f1ade89b409
Inserted document ID: 6675208c617b6f1ade89b40a
Inserted document ID: 6675208c617b6f1ade89b40b
Inserted document ID: 6675208c617b6f1ade89b40c
Inserted document ID: 6675208c617b6f1ade89b40d


In [234]:
from datetime import date
def financial_projection_detection(input_text):
    processed_text = [preprocess_text(input_text)]
    text = vectorizer.transform(processed_text)
    prediction = SVM_grid.predict(text)
    pred_result = 'Financial Data Detected' if prediction == 1 else 'No Financial Data Detected'
    label = 1 if prediction == 1 else 0
    result = {
        "Text": input_text,
        "Prediction": pred_result,
        "label": label,
        "logistic regression": classification_report(y_test, log_reg_pred_values)

    }
    return result

In [235]:
log = []

for input_text in testing_data:
    result = financial_projection_detection(input_text)
    log.append(result)

# Print or process the results as needed
for result in log:
    print(result)

{'Text': 'What are your plans for the weekend?', 'Prediction': 'No Financial Data Detected', 'label': 0, 'logistic regression': '              precision    recall  f1-score   support\n\n           0       0.92      0.81      0.86        43\n           1       0.79      0.91      0.85        33\n\n    accuracy                           0.86        76\n   macro avg       0.86      0.86      0.85        76\nweighted avg       0.86      0.86      0.86        76\n'}
{'Text': "I'm considering investing in some blue-chip stocks.", 'Prediction': 'Financial Data Detected', 'label': 1, 'logistic regression': '              precision    recall  f1-score   support\n\n           0       0.92      0.81      0.86        43\n           1       0.79      0.91      0.85        33\n\n    accuracy                           0.86        76\n   macro avg       0.86      0.86      0.85        76\nweighted avg       0.86      0.86      0.86        76\n'}
{'Text': 'I’m thinking of going for a hike in the mounta