In [4]:
import pandas as pd

# Load the dataset
file_path = 'out.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head(), 
#data.info()


(   Unnamed: 0       id        date  circumplex.arousal  circumplex.valence  \
 0           0  AS14.01  2014-02-26               -0.25                0.75   
 1           2  AS14.01  2014-03-21                0.20                0.20   
 2           3  AS14.01  2014-03-22                0.60                0.50   
 3           4  AS14.01  2014-03-23                0.20                0.80   
 4           5  AS14.01  2014-03-24                0.80                0.00   
 
    mood  appCat.builtin  appCat.communication  appCat.entertainment  \
 0  6.25           0.000                 0.000                 0.000   
 1  6.20        3139.218              6280.890              1007.456   
 2  6.40         731.429              4962.918                93.324   
 3  6.80        1286.246              5237.319                94.346   
 4  6.00         866.956              9270.629               976.971   
 
    appCat.finance  ...  appCat.office  appCat.other  appCat.social  \
 0           0.000 

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Convert 'date' to datetime
data['date'] = pd.to_datetime(data['date'])

date_counts = data['date'].value_counts()
id_counts = data['id'].value_counts()

# Convert the Series to a DataFrame for better visualization (optional)
date_counts_df = date_counts.reset_index()
date_counts_df.columns = ['Date', 'Count']

# Sort the DataFrame by the 'Date' column
sorted_date_counts_df = date_counts_df.sort_values(by='Date')

# Convert the Series to a DataFrame for better visualization (optional)
id_counts_df = id_counts.reset_index()
id_counts_df.columns = ['ID', 'Count']


# Fill missing 'screen' values with the median
data['screen'].fillna(data['screen'].median(), inplace=True)

# Select features and target
features = data.drop(['Unnamed: 0', 'id', 'date', 'mood'], axis=1)
target = data['mood']

sorted_date_counts_df.to_csv('date_counts_df.csv')
id_counts_df.to_csv('id_counts_df.csv')




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['screen'].fillna(data['screen'].median(), inplace=True)


In [20]:
import pandas as pd

# Load the dataset
data = pd.read_csv('out.csv')

# Convert 'date' to datetime if it's not already
data['date'] = pd.to_datetime(data['date'])

# Check if there's a row for every day for each id
pivot = data.pivot_table(index='date', columns='id', values='mood', aggfunc='count')

# Find missing entries
missing_entries = pivot.isnull()

# Print days and ids with missing data
print("Missing data for each id on each day:")
print(missing_entries)

# Optionally, check the number of unique days and ids
print("Number of unique days:", data['date'].nunique())
print("Number of unique ids:", data['id'].nunique())

missing_entries.to_csv('missing_entries.csv')


Missing data for each id on each day:
id          AS14.01  AS14.02  AS14.03  AS14.05  AS14.06  AS14.07  AS14.08  \
date                                                                        
2014-02-26    False     True     True     True     True     True     True   
2014-03-04     True     True     True     True     True     True     True   
2014-03-05     True     True     True     True     True     True     True   
2014-03-06     True     True     True     True     True     True    False   
2014-03-07     True     True     True     True     True     True    False   
...             ...      ...      ...      ...      ...      ...      ...   
2014-06-04     True     True     True     True     True     True     True   
2014-06-05     True     True     True     True     True     True     True   
2014-06-06     True     True     True     True     True     True     True   
2014-06-07     True     True     True     True     True     True     True   
2014-06-08     True     True     True 

In [23]:
import pandas as pd

# Load the dataset
data = pd.read_csv('out.csv')

# Convert 'date' to datetime and 'id' to a categorical type for better processing
data['date'] = pd.to_datetime(data['date'])
data['id'] = data['id'].astype('category')

# Create a pivot table to count entries for each date and id
pivot_table = data.pivot_table(index='date', columns='id', values='mood', aggfunc='count')

# Find IDs that have data for every day (no missing values in their column)
complete_data_ids = pivot_table.columns[~pivot_table.isnull().any()].tolist()

# Output the IDs
print("IDs with complete data for every day:", len(complete_data_ids))



IDs with complete data for every day: 27


  pivot_table = data.pivot_table(index='date', columns='id', values='mood', aggfunc='count')


In [None]:


# Normalize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

X_train.shape, X_test.shape


In [None]:
def create_sequences(features, target, sequence_length):
    X, y = [], []
    for i in range(len(features) - sequence_length):
        X.append(features[i: i + sequence_length])
        y.append(target[i + sequence_length])
    return np.array(X), np.array(y)

# Define sequence length
sequence_length = 7


# Reset the indices of y_train and y_test
y_train_reset = y_train.reset_index(drop=True)
y_test_reset = y_test.reset_index(drop=True)

# Create sequences for training and testing sets
X_train_seq, y_train_seq = create_sequences(X_train, y_train, sequence_length)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, sequence_length)

X_train_seq.shape, X_test_seq.shape




In [None]:
# Resetting indices on target arrays to align properly after shuffling
y_train_reset = y_train.reset_index(drop=True)
y_test_reset = y_test.reset_index(drop=True)

# Create sequences for training and testing sets
X_train_seq, y_train_seq = create_sequences(X_train, y_train_reset, sequence_length)
X_test_seq, y_test_seq = create_sequences(X_test, y_test_reset, sequence_length)

X_train_seq.shape, X_test_seq.shape


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Building the RNN model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(1)
])

# Compiling the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Display model summary
model.summary()

model.fit(X_train_seq, y_train_seq, epochs=30, batch_size=32, validation_data=(X_test_seq, y_test_seq))

